1//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Loop unrolling may create many similar GEPs for array accesses.
10// e.g., a 2-level loop
11//
12// float a[32][32]; // global variable
13//
14// for (int i = 0; i < 2; ++i) {
15//   for (int j = 0; j < 2; ++j) {
16//     ...
17//     ... = a[x + i][y + j];
18//     ...
19//   }
20// }
21//
22// will probably be unrolled to:
23//
24// gep %a, 0, %x, %y; load
25// gep %a, 0, %x, %y + 1; load
26// gep %a, 0, %x + 1, %y; load
27// gep %a, 0, %x + 1, %y + 1; load
28//
29// LLVM's GVN does not use partial redundancy elimination yet, and is thus
// unable to reuse (gep %a, 0, %x, %y). As a result, this missed optimization
// incurs a significant slowdown on targets with limited addressing modes. For
// instance,
32// because the PTX target does not support the reg+reg addressing mode, the
33// NVPTX backend emits PTX code that literally computes the pointer address of
34// each GEP, wasting tons of registers. It emits the following PTX for the
35// first load and similar PTX for other loads.
36//
37// mov.u32         %r1, %x;
38// mov.u32         %r2, %y;
39// mul.wide.u32    %rl2, %r1, 128;
40// mov.u64         %rl3, a;
41// add.s64         %rl4, %rl3, %rl2;
42// mul.wide.u32    %rl5, %r2, 4;
43// add.s64         %rl6, %rl4, %rl5;
44// ld.global.f32   %f1, [%rl6];
45//
46// To reduce the register pressure, the optimization implemented in this file
47// merges the common part of a group of GEPs, so we can compute each pointer
48// address by adding a simple offset to the common part, saving many registers.
49//
50// It works by splitting each GEP into a variadic base and a constant offset.
51// The variadic base can be computed once and reused by multiple GEPs, and the
52// constant offsets can be nicely folded into the reg+immediate addressing mode
53// (supported by most targets) without using any extra register.
54//
55// For instance, we transform the four GEPs and four loads in the above example
56// into:
57//
58// base = gep a, 0, x, y
59// load base
// load base + 1  * sizeof(float)
61// load base + 32 * sizeof(float)
62// load base + 33 * sizeof(float)
63//
// Given the transformed IR, a backend that supports the reg+immediate
// addressing mode can easily fold the pointer arithmetic into the loads. For
// example, the NVPTX backend folds the pointer arithmetic into the
// ld.global.f32 instructions, and the resultant PTX uses far fewer registers.
68//
69// mov.u32         %r1, %tid.x;
70// mov.u32         %r2, %tid.y;
71// mul.wide.u32    %rl2, %r1, 128;
72// mov.u64         %rl3, a;
73// add.s64         %rl4, %rl3, %rl2;
74// mul.wide.u32    %rl5, %r2, 4;
75// add.s64         %rl6, %rl4, %rl5;
76// ld.global.f32   %f1, [%rl6]; // so far the same as unoptimized PTX
77// ld.global.f32   %f2, [%rl6+4]; // much better
78// ld.global.f32   %f3, [%rl6+128]; // much better
79// ld.global.f32   %f4, [%rl6+132]; // much better
80//
81// Another improvement enabled by the LowerGEP flag is to lower a GEP with
82// multiple indices to either multiple GEPs with a single index or arithmetic
83// operations (depending on whether the target uses alias analysis in codegen).
// Such a transformation can have the following benefits:
// (1) It can always extract constants from indices of structure types.
// (2) After such lowering, there are more optimization opportunities such as
//     CSE, LICM and CGP.
88//
89// E.g. The following GEPs have multiple indices:
90//  BB1:
91//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
92//    load %p
93//    ...
94//  BB2:
//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
96//    load %p2
97//    ...
98//
// We cannot CSE the common part related to index "i64 %i" while the GEPs are
// in this form. Lowering the GEPs exposes that opportunity.
101// If the target does not use alias analysis in codegen, this pass will
102// lower a GEP with multiple indices into arithmetic operations:
103//  BB1:
104//    %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
105//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
106//    %3 = add i64 %1, %2                          ; CSE opportunity
107//    %4 = mul i64 %j1, length_of_struct
108//    %5 = add i64 %3, %4
//    %6 = add i64 %5, struct_field_3              ; Constant offset
110//    %p = inttoptr i64 %6 to i32*
111//    load %p
112//    ...
113//  BB2:
114//    %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
115//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
116//    %9 = add i64 %7, %8                          ; CSE opportunity
117//    %10 = mul i64 %j2, length_of_struct
118//    %11 = add i64 %9, %10
119//    %12 = add i64 %11, struct_field_2            ; Constant offset
//    %p2 = inttoptr i64 %12 to i32*
121//    load %p2
122//    ...
123//
124// If the target uses alias analysis in codegen, this pass will lower a GEP
125// with multiple indices into multiple GEPs with a single index:
126//  BB1:
127//    %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
128//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
129//    %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
130//    %4 = mul i64 %j1, length_of_struct
131//    %5 = getelementptr i8* %3, i64 %4
132//    %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
133//    %p = bitcast i8* %6 to i32*
134//    load %p
135//    ...
136//  BB2:
137//    %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
138//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
139//    %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
140//    %10 = mul i64 %j2, length_of_struct
141//    %11 = getelementptr i8* %9, i64 %10
142//    %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
143//    %p2 = bitcast i8* %12 to i32*
144//    load %p2
145//    ...
146//
147// Lowering GEPs can also benefit other passes such as LICM and CGP.
// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
// indices if any of the indices is loop variant. If we lower such a GEP into
// invariant parts and variant parts, LICM can hoist/sink the invariant parts.
// CGP (CodeGenPrepare) tries to sink address calculations that match the
// target's addressing modes. A GEP with multiple indices may not match and
// will not be sunk. If we lower such a GEP into smaller parts, CGP may sink
// some of them, so we end up with a better addressing mode.
155//
156//===----------------------------------------------------------------------===//
157
158#include "llvm/ADT/APInt.h"
159#include "llvm/ADT/DenseMap.h"
160#include "llvm/ADT/DepthFirstIterator.h"
161#include "llvm/ADT/SmallVector.h"
162#include "llvm/Analysis/LoopInfo.h"
163#include "llvm/Analysis/MemoryBuiltins.h"
164#include "llvm/Analysis/ScalarEvolution.h"
165#include "llvm/Analysis/TargetLibraryInfo.h"
166#include "llvm/Analysis/TargetTransformInfo.h"
167#include "llvm/Analysis/ValueTracking.h"
168#include "llvm/IR/BasicBlock.h"
169#include "llvm/IR/Constant.h"
170#include "llvm/IR/Constants.h"
171#include "llvm/IR/DataLayout.h"
172#include "llvm/IR/DerivedTypes.h"
173#include "llvm/IR/Dominators.h"
174#include "llvm/IR/Function.h"
175#include "llvm/IR/GetElementPtrTypeIterator.h"
176#include "llvm/IR/IRBuilder.h"
177#include "llvm/IR/Instruction.h"
178#include "llvm/IR/Instructions.h"
179#include "llvm/IR/Module.h"
180#include "llvm/IR/PatternMatch.h"
181#include "llvm/IR/Type.h"
182#include "llvm/IR/User.h"
183#include "llvm/IR/Value.h"
184#include "llvm/InitializePasses.h"
185#include "llvm/Pass.h"
186#include "llvm/Support/Casting.h"
187#include "llvm/Support/CommandLine.h"
188#include "llvm/Support/ErrorHandling.h"
189#include "llvm/Support/raw_ostream.h"
190#include "llvm/Target/TargetMachine.h"
191#include "llvm/Transforms/Scalar.h"
192#include "llvm/Transforms/Utils/Local.h"
193#include <cassert>
194#include <cstdint>
195#include <string>
196
197using namespace llvm;
198using namespace llvm::PatternMatch;
199
200static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
201    "disable-separate-const-offset-from-gep", cl::init(false),
202    cl::desc("Do not separate the constant offset from a GEP instruction"),
203    cl::Hidden);
204
205// Setting this flag may emit false positives when the input module already
206// contains dead instructions. Therefore, we set it only in unit tests that are
207// free of dead code.
208static cl::opt<bool>
209    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
210                     cl::desc("Verify this pass produces no dead code"),
211                     cl::Hidden);
212
213namespace {
214
215/// A helper class for separating a constant offset from a GEP index.
216///
/// In real programs, a GEP index may be more complicated than a simple
/// addition of something and a constant integer that can be trivially split.
/// For example, to split ((a << 3) | 5) + b, we need to search deeper for the
/// constant offset, so that we can separate the index into (a << 3) + b and 5.
221///
222/// Therefore, this class looks into the expression that computes a given GEP
223/// index, and tries to find a constant integer that can be hoisted to the
/// outermost level of the expression as an addition. Not every constant in an
/// expression can jump out. For example, we cannot transform (b * (a + 5)) to
/// (b * a + 5); nor can we transform (3 * (a + 5)) to (3 * a + 5). However, in
/// the latter case, -instcombine has probably already optimized (3 * (a + 5))
/// to (3 * a + 15).
228class ConstantOffsetExtractor {
229public:
230  /// Extracts a constant offset from the given GEP index. It returns the
231  /// new index representing the remainder (equal to the original index minus
232  /// the constant offset), or nullptr if we cannot extract a constant offset.
233  /// \p Idx The given GEP index
234  /// \p GEP The given GEP
235  /// \p UserChainTail Outputs the tail of UserChain so that we can
236  ///                  garbage-collect unused instructions in UserChain.
237  static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
238                        User *&UserChainTail, const DominatorTree *DT);
239
  /// Looks for a constant offset in the given GEP index without extracting
  /// it. It returns the numeric value of the constant offset found (0 if
  /// none is found). The meanings of the arguments are the same as in Extract.
243  static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
244                      const DominatorTree *DT);
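  // Illustrative example (the index below is hypothetical, not taken from a
  // test): for a GEP whose index is "%idx = add i64 %i, 5", Find returns 5,
  // while Extract returns a new index "%i" and records the def-use chain from
  // the constant 5 up to %idx in UserChain (whose tail is output through
  // UserChainTail).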
245
246private:
247  ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
248      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
249  }
250
  /// Searches the expression that computes V for a non-zero constant C s.t.
  /// V can be reassociated into the form V' + C. If the search is
  /// successful, returns C and updates UserChain as a def-use chain from C to
  /// V; otherwise, UserChain is empty.
255  ///
256  /// \p V            The given expression
257  /// \p SignExtended Whether V will be sign-extended in the computation of the
258  ///                 GEP index
259  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
260  ///                 GEP index
261  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
262  ///                 an index of an inbounds GEP is guaranteed to be
  ///                 non-negative. Leveraging this, we can better split
264  ///                 inbounds GEPs.
265  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
266
267  /// A helper function to look into both operands of a binary operator.
268  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
269                            bool ZeroExtended);
270
271  /// After finding the constant offset C from the GEP index I, we build a new
272  /// index I' s.t. I' + C = I. This function builds and returns the new
273  /// index I' according to UserChain produced by function "find".
274  ///
275  /// The building conceptually takes two steps:
276  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
277  /// that computes I
278  /// 2) reassociate the expression tree to the form I' + C.
279  ///
280  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
281  /// sext to a, b and 5 so that we have
282  ///   sext(a) + (sext(b) + 5).
283  /// Then, we reassociate it to
284  ///   (sext(a) + sext(b)) + 5.
285  /// Given this form, we know I' is sext(a) + sext(b).
286  Value *rebuildWithoutConstOffset();
287
288  /// After the first step of rebuilding the GEP index without the constant
289  /// offset, distribute s/zext to the operands of all operators in UserChain.
  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
291  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
292  ///
293  /// The function also updates UserChain to point to new subexpressions after
294  /// distributing s/zext. e.g., the old UserChain of the above example is
295  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
296  /// and the new UserChain is
297  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
299  ///
300  /// \p ChainIndex The index to UserChain. ChainIndex is initially
301  ///               UserChain.size() - 1, and is decremented during
302  ///               the recursion.
303  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
304
305  /// Reassociates the GEP index to the form I' + C and returns I'.
306  Value *removeConstOffset(unsigned ChainIndex);
307
308  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
309  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
310  /// returns "sext i32 (zext i16 V to i32) to i64".
311  Value *applyExts(Value *V);
312
313  /// A helper function that returns whether we can trace into the operands
314  /// of binary operator BO for a constant offset.
315  ///
316  /// \p SignExtended Whether BO is surrounded by sext
317  /// \p ZeroExtended Whether BO is surrounded by zext
318  /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
319  ///                array index.
320  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
321                    bool NonNegative);
322
  /// The path from the constant offset to the old GEP index. e.g., if the GEP
  /// index is "a * b + (c + 5)", then after running function find, UserChain[0]
  /// will be the constant 5, UserChain[1] will be the subexpression "c + 5",
  /// and UserChain[2] will be the entire expression "a * b + (c + 5)".
327  ///
328  /// This path helps to rebuild the new GEP index.
329  SmallVector<User *, 8> UserChain;
330
331  /// A data structure used in rebuildWithoutConstOffset. Contains all
332  /// sext/zext instructions along UserChain.
333  SmallVector<CastInst *, 16> ExtInsts;
334
335  /// Insertion position of cloned instructions.
336  Instruction *IP;
337
338  const DataLayout &DL;
339  const DominatorTree *DT;
340};
341
342/// A pass that tries to split every GEP in the function into a variadic
343/// base and a constant offset. It is a FunctionPass because searching for the
344/// constant offset may inspect other basic blocks.
345class SeparateConstOffsetFromGEP : public FunctionPass {
346public:
347  static char ID;
348
349  SeparateConstOffsetFromGEP(bool LowerGEP = false)
350      : FunctionPass(ID), LowerGEP(LowerGEP) {
351    initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
352  }
353
354  void getAnalysisUsage(AnalysisUsage &AU) const override {
355    AU.addRequired<DominatorTreeWrapperPass>();
356    AU.addRequired<ScalarEvolutionWrapperPass>();
357    AU.addRequired<TargetTransformInfoWrapperPass>();
358    AU.addRequired<LoopInfoWrapperPass>();
359    AU.setPreservesCFG();
360    AU.addRequired<TargetLibraryInfoWrapperPass>();
361  }
362
363  bool doInitialization(Module &M) override {
364    DL = &M.getDataLayout();
365    return false;
366  }
367
368  bool runOnFunction(Function &F) override;
369
370private:
371  /// Tries to split the given GEP into a variadic base and a constant offset,
372  /// and returns true if the splitting succeeds.
373  bool splitGEP(GetElementPtrInst *GEP);
374
375  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
376  /// Function splitGEP already split the original GEP into a variadic part and
377  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
378  /// variadic part into a set of GEPs with a single index and applies
379  /// AccumulativeByteOffset to it.
380  /// \p Variadic                  The variadic part of the original GEP.
381  /// \p AccumulativeByteOffset    The constant offset.
382  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
383                              int64_t AccumulativeByteOffset);
384
385  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
386  /// Function splitGEP already split the original GEP into a variadic part and
387  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
388  /// variadic part into a set of arithmetic operations and applies
389  /// AccumulativeByteOffset to it.
390  /// \p Variadic                  The variadic part of the original GEP.
391  /// \p AccumulativeByteOffset    The constant offset.
392  void lowerToArithmetics(GetElementPtrInst *Variadic,
393                          int64_t AccumulativeByteOffset);
394
  /// Finds the constant offset within each index and accumulates them. If
  /// LowerGEP is true, it searches the indices of both sequential and struct
  /// types; otherwise, it only searches sequential indices. The output
  /// NeedsExtraction indicates whether we successfully found a non-zero
  /// constant offset.
400  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
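  // A hypothetical example (names invented for illustration): for a GEP
  //   getelementptr inbounds %struct.S, %struct.S* %p, i64 %k, i32 1
  // where %k is computed as "add i64 %i, 2", the sequential index contributes
  // 2 * sizeof(%struct.S) and, if LowerGEP is set, the struct index
  // contributes the byte offset of field 1; the sum is returned as the
  // accumulative byte offset.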
401
402  /// Canonicalize array indices to pointer-size integers. This helps to
403  /// simplify the logic of splitting a GEP. For example, if a + b is a
404  /// pointer-size integer, we have
405  ///   gep base, a + b = gep (gep base, a), b
406  /// However, this equality may not hold if the size of a + b is smaller than
407  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
408  /// pointer size before computing the address
409  /// (http://llvm.org/docs/LangRef.html#id181).
410  ///
411  /// This canonicalization is very likely already done in clang and
412  /// instcombine. Therefore, the program will probably remain the same.
413  ///
414  /// Returns true if the module changes.
415  ///
416  /// Verified in @i32_add in split-gep.ll
417  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
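  // For instance (illustrative): on a target with 64-bit pointers, an i32
  // array index %i is rewritten to "sext i32 %i to i64" before the GEP is
  // split.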
418
419  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
420  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  /// the constant offset. After extraction, it becomes desirable to reunite the
  /// distributed sexts. For example,
  ///
  ///                              &a[sext(i +nsw (j +nsw 5))]
425  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
426  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
427  ///   => reunion                 &a[sext(i +nsw j)] + 5
428  bool reuniteExts(Function &F);
429
430  /// A helper that reunites sexts in an instruction.
431  bool reuniteExts(Instruction *I);
432
433  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
434  Instruction *findClosestMatchingDominator(const SCEV *Key,
435                                            Instruction *Dominatee);
436  /// Verify F is free of dead code.
437  void verifyNoDeadCode(Function &F);
438
  bool hasMoreThanOneUseInLoop(Value *V, Loop *L);
440
  // Swap the index operands of two GEPs.
442  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
443
  // Check whether it is safe to swap the index operands of two GEPs.
445  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
446                            Loop *CurLoop);
447
448  const DataLayout *DL = nullptr;
449  DominatorTree *DT = nullptr;
450  ScalarEvolution *SE;
451
452  LoopInfo *LI;
453  TargetLibraryInfo *TLI;
454
455  /// Whether to lower a GEP with multiple indices into arithmetic operations or
456  /// multiple GEPs with a single index.
457  bool LowerGEP;
458
459  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
460};
461
462} // end anonymous namespace
463
464char SeparateConstOffsetFromGEP::ID = 0;
465
466INITIALIZE_PASS_BEGIN(
467    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
468    "Split GEPs to a variadic base and a constant offset for better CSE", false,
469    false)
470INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
471INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
472INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
473INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
474INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
475INITIALIZE_PASS_END(
476    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
477    "Split GEPs to a variadic base and a constant offset for better CSE", false,
478    false)
479
480FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
481  return new SeparateConstOffsetFromGEP(LowerGEP);
482}
483
484bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
485                                            bool ZeroExtended,
486                                            BinaryOperator *BO,
487                                            bool NonNegative) {
488  // We only consider ADD, SUB and OR, because a non-zero constant found in
489  // expressions composed of these operations can be easily hoisted as a
490  // constant offset by reassociation.
491  if (BO->getOpcode() != Instruction::Add &&
492      BO->getOpcode() != Instruction::Sub &&
493      BO->getOpcode() != Instruction::Or) {
494    return false;
495  }
496
497  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
498  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
499  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
500  // FIXME: this does not appear to be covered by any tests
501  //        (with x86/aarch64 backends at least)
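  // For example (illustrative, not taken from a test): in (a << 4) | 5, the
  // low four bits of (a << 4) are known to be zero, so the "or" has no common
  // bits with 5 and behaves exactly like (a << 4) + 5, which lets us hoist 5.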
502  if (BO->getOpcode() == Instruction::Or &&
503      !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
504    return false;
505
506  // In addition, tracing into BO requires that its surrounding s/zext (if
507  // any) is distributable to both operands.
508  //
509  // Suppose BO = A op B.
510  //  SignExtended | ZeroExtended | Distributable?
511  // --------------+--------------+----------------------------------
512  //       0       |      0       | true because no s/zext exists
513  //       0       |      1       | zext(BO) == zext(A) op zext(B)
514  //       1       |      0       | sext(BO) == sext(A) op sext(B)
515  //       1       |      1       | zext(sext(BO)) ==
516  //               |              |     zext(sext(A)) op zext(sext(B))
517  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
518    // If a + b >= 0 and (a >= 0 or b >= 0), then
519    //   sext(a + b) = sext(a) + sext(b)
520    // even if the addition is not marked nsw.
521    //
    // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
523    // index if the constant offset is non-negative.
524    //
525    // Verified in @sext_add in split-gep.ll.
526    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
527      if (!ConstLHS->isNegative())
528        return true;
529    }
530    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
531      if (!ConstRHS->isNegative())
532        return true;
533    }
534  }
535
536  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
537  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
538  if (BO->getOpcode() == Instruction::Add ||
539      BO->getOpcode() == Instruction::Sub) {
540    if (SignExtended && !BO->hasNoSignedWrap())
541      return false;
542    if (ZeroExtended && !BO->hasNoUnsignedWrap())
543      return false;
544  }
545
546  return true;
547}
548
549APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
550                                                   bool SignExtended,
551                                                   bool ZeroExtended) {
552  // BO being non-negative does not shed light on whether its operands are
553  // non-negative. Clear the NonNegative flag here.
554  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
555                              /* NonNegative */ false);
556  // If we found a constant offset in the left operand, stop and return that.
557  // This shortcut might cause us to miss opportunities of combining the
558  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
559  // However, such cases are probably already handled by -instcombine,
560  // given this pass runs after the standard optimizations.
561  if (ConstantOffset != 0) return ConstantOffset;
562  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
563                        /* NonNegative */ false);
  // If BO is a sub operator, negate the constant offset found in the right
565  // operand.
566  if (BO->getOpcode() == Instruction::Sub)
567    ConstantOffset = -ConstantOffset;
568  return ConstantOffset;
569}
570
571APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
572                                    bool ZeroExtended, bool NonNegative) {
573  // TODO(jingyue): We could trace into integer/pointer casts, such as
574  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
575  // integers because it gives good enough results for our benchmarks.
576  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
577
578  // We cannot do much with Values that are not a User, such as an Argument.
579  User *U = dyn_cast<User>(V);
580  if (U == nullptr) return APInt(BitWidth, 0);
581
582  APInt ConstantOffset(BitWidth, 0);
583  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
584    // Hooray, we found it!
585    ConstantOffset = CI->getValue();
586  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
587    // Trace into subexpressions for more hoisting opportunities.
588    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
589      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
590  } else if (isa<TruncInst>(V)) {
591    ConstantOffset =
592        find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
593            .trunc(BitWidth);
594  } else if (isa<SExtInst>(V)) {
595    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
596                          ZeroExtended, NonNegative).sext(BitWidth);
597  } else if (isa<ZExtInst>(V)) {
598    // As an optimization, we can clear the SignExtended flag because
599    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
600    //
601    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
602    ConstantOffset =
603        find(U->getOperand(0), /* SignExtended */ false,
604             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
605  }
606
607  // If we found a non-zero constant offset, add it to the path for
608  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
609  // help this optimization.
610  if (ConstantOffset != 0)
611    UserChain.push_back(U);
612  return ConstantOffset;
613}
614
615Value *ConstantOffsetExtractor::applyExts(Value *V) {
616  Value *Current = V;
  // ExtInsts is built in use-def order. Therefore, we apply them to V in
  // reverse order.
619  for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
620    if (Constant *C = dyn_cast<Constant>(Current)) {
621      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
622      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
623      Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
624    } else {
625      Instruction *Ext = (*I)->clone();
626      Ext->setOperand(0, Current);
627      Ext->insertBefore(IP);
628      Current = Ext;
629    }
630  }
631  return Current;
632}
633
634Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
635  distributeExtsAndCloneChain(UserChain.size() - 1);
636  // Remove all nullptrs (used to be s/zext) from UserChain.
637  unsigned NewSize = 0;
638  for (User *I : UserChain) {
639    if (I != nullptr) {
640      UserChain[NewSize] = I;
641      NewSize++;
642    }
643  }
644  UserChain.resize(NewSize);
645  return removeConstOffset(UserChain.size() - 1);
646}
647
648Value *
649ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
650  User *U = UserChain[ChainIndex];
651  if (ChainIndex == 0) {
652    assert(isa<ConstantInt>(U));
653    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
654    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
655  }
656
657  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
658    assert(
659        (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
660        "Only following instructions can be traced: sext, zext & trunc");
661    ExtInsts.push_back(Cast);
662    UserChain[ChainIndex] = nullptr;
663    return distributeExtsAndCloneChain(ChainIndex - 1);
664  }
665
  // Function find only traces into BinaryOperators and CastInsts.
667  BinaryOperator *BO = cast<BinaryOperator>(U);
668  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
669  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
670  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
671  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
672
673  BinaryOperator *NewBO = nullptr;
674  if (OpNo == 0) {
675    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
676                                   BO->getName(), IP);
677  } else {
678    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
679                                   BO->getName(), IP);
680  }
681  return UserChain[ChainIndex] = NewBO;
682}
683
684Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
685  if (ChainIndex == 0) {
686    assert(isa<ConstantInt>(UserChain[ChainIndex]));
687    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
688  }
689
690  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
691  assert(BO->getNumUses() <= 1 &&
692         "distributeExtsAndCloneChain clones each BinaryOperator in "
693         "UserChain, so no one should be used more than "
694         "once");
695
696  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
697  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
698  Value *NextInChain = removeConstOffset(ChainIndex - 1);
699  Value *TheOther = BO->getOperand(1 - OpNo);
700
701  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
702  // sub-expression to be just TheOther.
703  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
704    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
705      return TheOther;
706  }
707
708  BinaryOperator::BinaryOps NewOp = BO->getOpcode();
709  if (BO->getOpcode() == Instruction::Or) {
710    // Rebuild "or" as "add", because "or" may be invalid for the new
711    // expression.
712    //
713    // For instance, given
714    //   a | (b + 5) where a and b + 5 have no common bits,
715    // we can extract 5 as the constant offset.
716    //
717    // However, reusing the "or" in the new index would give us
718    //   (a | b) + 5
719    // which does not equal a | (b + 5).
720    //
721    // Replacing the "or" with "add" is fine, because
722    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
723    NewOp = Instruction::Add;
724  }
725
726  BinaryOperator *NewBO;
727  if (OpNo == 0) {
728    NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
729  } else {
730    NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
731  }
732  NewBO->takeName(BO);
733  return NewBO;
734}
735
736Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
737                                        User *&UserChainTail,
738                                        const DominatorTree *DT) {
739  ConstantOffsetExtractor Extractor(GEP, DT);
740  // Find a non-zero constant offset first.
741  APInt ConstantOffset =
742      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
743                     GEP->isInBounds());
744  if (ConstantOffset == 0) {
745    UserChainTail = nullptr;
746    return nullptr;
747  }
748  // Separates the constant offset from the GEP index.
749  Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
750  UserChainTail = Extractor.UserChain.back();
751  return IdxWithoutConstOffset;
752}
753
754int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
755                                      const DominatorTree *DT) {
  // If Idx is an index of an inbounds GEP, Idx is guaranteed to be non-negative.
757  return ConstantOffsetExtractor(GEP, DT)
758      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
759            GEP->isInBounds())
760      .getSExtValue();
761}
762
763bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
764    GetElementPtrInst *GEP) {
765  bool Changed = false;
766  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
767  gep_type_iterator GTI = gep_type_begin(*GEP);
768  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
769       I != E; ++I, ++GTI) {
770    // Skip struct member indices which must be i32.
771    if (GTI.isSequential()) {
772      if ((*I)->getType() != IntPtrTy) {
773        *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
774        Changed = true;
775      }
776    }
777  }
778  return Changed;
779}
780
781int64_t
782SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
783                                                 bool &NeedsExtraction) {
784  NeedsExtraction = false;
785  int64_t AccumulativeByteOffset = 0;
786  gep_type_iterator GTI = gep_type_begin(*GEP);
787  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
788    if (GTI.isSequential()) {
789      // Tries to extract a constant offset from this GEP index.
790      int64_t ConstantOffset =
791          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
792      if (ConstantOffset != 0) {
793        NeedsExtraction = true;
794        // A GEP may have multiple indices.  We accumulate the extracted
795        // constant offset to a byte offset, and later offset the remainder of
796        // the original GEP with this byte offset.
797        AccumulativeByteOffset +=
798            ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
799      }
800    } else if (LowerGEP) {
801      StructType *StTy = GTI.getStructType();
802      uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
803      // Skip field 0 as the offset is always 0.
804      if (Field != 0) {
805        NeedsExtraction = true;
806        AccumulativeByteOffset +=
807            DL->getStructLayout(StTy)->getElementOffset(Field);
808      }
809    }
810  }
811  return AccumulativeByteOffset;
812}
813
814void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
815    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
816  IRBuilder<> Builder(Variadic);
817  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
818
819  Type *I8PtrTy =
820      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
821  Value *ResultPtr = Variadic->getOperand(0);
822  Loop *L = LI->getLoopFor(Variadic->getParent());
  // The base pointer is a swap candidate only if it is loop invariant and
  // used no more than once inside the loop.
824  bool isSwapCandidate =
825      L && L->isLoopInvariant(ResultPtr) &&
826      !hasMoreThanOneUseInLoop(ResultPtr, L);
827  Value *FirstResult = nullptr;
828
829  if (ResultPtr->getType() != I8PtrTy)
830    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
831
832  gep_type_iterator GTI = gep_type_begin(*Variadic);
833  // Create an ugly GEP for each sequential index. We don't create GEPs for
834  // structure indices, as they are accumulated in the constant offset index.
835  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
836    if (GTI.isSequential()) {
837      Value *Idx = Variadic->getOperand(I);
838      // Skip zero indices.
839      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
840        if (CI->isZero())
841          continue;
842
843      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
844                                DL->getTypeAllocSize(GTI.getIndexedType()));
845      // Scale the index by element size.
846      if (ElementSize != 1) {
847        if (ElementSize.isPowerOf2()) {
848          Idx = Builder.CreateShl(
849              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
850        } else {
851          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
852        }
853      }
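      // For instance (illustrative): an index into an array of i32 becomes
      // "shl %idx, 2", while an index into an array of 12-byte elements
      // becomes "mul %idx, 12".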
854      // Create an ugly GEP with a single index for each index.
855      ResultPtr =
856          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
857      if (FirstResult == nullptr)
858        FirstResult = ResultPtr;
859    }
860  }
861
862  // Create a GEP with the constant offset index.
863  if (AccumulativeByteOffset != 0) {
864    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
865    ResultPtr =
866        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
867  } else
868    isSwapCandidate = false;
869
  // If we created a GEP with a constant index, and the base is loop invariant,
  // swap the index operands of the first GEP and the constant-offset GEP, so
  // LICM can later hoist the constant-offset GEP out of the loop.
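  // A hypothetical shape of the result (names invented):
  //   %g1 = getelementptr i8, i8* %base, i64 %scaled_i ; variable index
  //   %g2 = getelementptr i8, i8* %g1, i64 256         ; constant offset
  // After swapping the index operands, %g1 adds the constant 256 to the
  // loop-invariant %base and %g2 adds the variable index, so %g1 becomes
  // loop invariant and LICM can hoist it.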
873  GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
874  GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
875  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
876    swapGEPOperand(FirstGEP, SecondGEP);
877
878  if (ResultPtr->getType() != Variadic->getType())
879    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
880
881  Variadic->replaceAllUsesWith(ResultPtr);
882  Variadic->eraseFromParent();
883}
884
885void
886SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
887                                               int64_t AccumulativeByteOffset) {
888  IRBuilder<> Builder(Variadic);
889  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
890
891  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
892  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
  // don't create arithmetic for structure indices, as they are accumulated
  // in the constant offset index.
896  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
897    if (GTI.isSequential()) {
898      Value *Idx = Variadic->getOperand(I);
899      // Skip zero indices.
900      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
901        if (CI->isZero())
902          continue;
903
904      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
905                                DL->getTypeAllocSize(GTI.getIndexedType()));
906      // Scale the index by element size.
907      if (ElementSize != 1) {
908        if (ElementSize.isPowerOf2()) {
909          Idx = Builder.CreateShl(
910              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
911        } else {
912          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
913        }
914      }
915      // Create an ADD for each index.
916      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
917    }
918  }
919
920  // Create an ADD for the constant offset index.
921  if (AccumulativeByteOffset != 0) {
922    ResultPtr = Builder.CreateAdd(
923        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
924  }
925
926  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
927  Variadic->replaceAllUsesWith(ResultPtr);
928  Variadic->eraseFromParent();
929}
930
931bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
932  // Skip vector GEPs.
933  if (GEP->getType()->isVectorTy())
934    return false;
935
936  // The backend can already nicely handle the case where all indices are
937  // constant.
938  if (GEP->hasAllConstantIndices())
939    return false;
940
941  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
942
943  bool NeedsExtraction;
944  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
945
946  if (!NeedsExtraction)
947    return Changed;
948
949  TargetTransformInfo &TTI =
950      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction());
951
  // If LowerGEP is disabled, before really splitting the GEP, check whether the
  // backend supports the addressing mode we are about to produce. If not, this
  // splitting probably won't be beneficial.
  // If LowerGEP is enabled, even if the extracted constant offset cannot match
  // the addressing mode, we can still optimize the other lowered parts of the
  // variable indices. Therefore, we don't check addressing modes in that case.
959  if (!LowerGEP) {
960    unsigned AddrSpace = GEP->getPointerAddressSpace();
961    if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
962                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
963                                   /*HasBaseReg=*/true, /*Scale=*/0,
964                                   AddrSpace)) {
965      return Changed;
966    }
967  }
968
969  // Remove the constant offset in each sequential index. The resultant GEP
970  // computes the variadic base.
971  // Notice that we don't remove struct field indices here. If LowerGEP is
972  // disabled, a structure index is not accumulated and we still use the old
973  // one. If LowerGEP is enabled, a structure index is accumulated in the
  // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
975  // handle the constant offset and won't need a new structure index.
976  gep_type_iterator GTI = gep_type_begin(*GEP);
977  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
978    if (GTI.isSequential()) {
979      // Splits this GEP index into a variadic part and a constant offset, and
980      // uses the variadic part as the new index.
981      Value *OldIdx = GEP->getOperand(I);
982      User *UserChainTail;
983      Value *NewIdx =
984          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
985      if (NewIdx != nullptr) {
986        // Switches to the index with the constant offset removed.
987        GEP->setOperand(I, NewIdx);
988        // After switching to the new index, we can garbage-collect UserChain
989        // and the old index if they are not used.
990        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
991        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
992      }
993    }
994  }
995
996  // Clear the inbounds attribute because the new index may be off-bound.
997  // e.g.,
998  //
999  //   b     = add i64 a, 5
1000  //   addr  = gep inbounds float, float* p, i64 b
1001  //
1002  // is transformed to:
1003  //
1004  //   addr2 = gep float, float* p, i64 a ; inbounds removed
1005  //   addr  = gep inbounds float, float* addr2, i64 5
1006  //
1007  // If a is -4, although the old index b is in bounds, the new index a is
1008  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
1009  // inbounds keyword is not present, the offsets are added to the base
1010  // address with silently-wrapping two's complement arithmetic".
  // Therefore, the final code will be semantically equivalent.
1012  //
1013  // TODO(jingyue): do some range analysis to keep as many inbounds as
1014  // possible. GEPs with inbounds are more friendly to alias analysis.
1015  bool GEPWasInBounds = GEP->isInBounds();
1016  GEP->setIsInBounds(false);
1017
1018  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
1019  if (LowerGEP) {
1020    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
1021    // arithmetic operations if the target uses alias analysis in codegen.
1022    if (TTI.useAA())
1023      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
1024    else
1025      lowerToArithmetics(GEP, AccumulativeByteOffset);
1026    return true;
1027  }
1028
1029  // No need to create another GEP if the accumulative byte offset is 0.
1030  if (AccumulativeByteOffset == 0)
1031    return true;
1032
1033  // Offsets the base with the accumulative byte offset.
1034  //
1035  //   %gep                        ; the base
1036  //   ... %gep ...
1037  //
1038  // => add the offset
1039  //
1040  //   %gep2                       ; clone of %gep
1041  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
1042  //   %gep                        ; will be removed
1043  //   ... %gep ...
1044  //
1045  // => replace all uses of %gep with %new.gep and remove %gep
1046  //
1047  //   %gep2                       ; clone of %gep
1048  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
1049  //   ... %new.gep ...
1050  //
1051  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
1052  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
1053  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
1054  // type of %gep.
1055  //
1056  //   %gep2                       ; clone of %gep
1057  //   %0       = bitcast %gep2 to i8*
1058  //   %uglygep = gep %0, <offset>
1059  //   %new.gep = bitcast %uglygep to <type of %gep>
1060  //   ... %new.gep ...
1061  Instruction *NewGEP = GEP->clone();
1062  NewGEP->insertBefore(GEP);
1063
  // Per the C standard, signed / unsigned = unsigned and signed % unsigned =
  // unsigned. Therefore, we cast ElementTypeSizeOfGEP to a signed integer
  // because it is used together with the signed AccumulativeByteOffset below.
1067  int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
1068      DL->getTypeAllocSize(GEP->getResultElementType()));
1069  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
1070  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
1071    // Very likely. As long as %gep is naturally aligned, the byte offset we
1072    // extracted should be a multiple of sizeof(*%gep).
1073    int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
1074    NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
1075                                       ConstantInt::get(IntPtrTy, Index, true),
1076                                       GEP->getName(), GEP);
1077    NewGEP->copyMetadata(*GEP);
1078    // Inherit the inbounds attribute of the original GEP.
1079    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
1080  } else {
1081    // Unlikely but possible. For example,
1082    // #pragma pack(1)
1083    // struct S {
1084    //   int a[3];
1085    //   int64 b[8];
1086    // };
1087    // #pragma pack()
1088    //
1089    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
1090    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
1091    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
1092    // sizeof(int64).
1093    //
1094    // Emit an uglygep in this case.
1095    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
1096                                       GEP->getPointerAddressSpace());
1097    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
1098    NewGEP = GetElementPtrInst::Create(
1099        Type::getInt8Ty(GEP->getContext()), NewGEP,
1100        ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
1101        GEP);
1102    NewGEP->copyMetadata(*GEP);
1103    // Inherit the inbounds attribute of the original GEP.
1104    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
1105    if (GEP->getType() != I8PtrTy)
1106      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
1107  }
1108
1109  GEP->replaceAllUsesWith(NewGEP);
1110  GEP->eraseFromParent();
1111
1112  return true;
1113}
1114
1115bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
1116  if (skipFunction(F))
1117    return false;
1118
1119  if (DisableSeparateConstOffsetFromGEP)
1120    return false;
1121
1122  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1123  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1124  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1125  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
1126  bool Changed = false;
1127  for (BasicBlock &B : F) {
1128    for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
1129      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
1130        Changed |= splitGEP(GEP);
    // No need to split GEP ConstantExprs because all their indices are
    // already constant.
1133  }
1134
1135  Changed |= reuniteExts(F);
1136
1137  if (VerifyNoDeadCode)
1138    verifyNoDeadCode(F);
1139
1140  return Changed;
1141}
1142
1143Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
1144    const SCEV *Key, Instruction *Dominatee) {
1145  auto Pos = DominatingExprs.find(Key);
1146  if (Pos == DominatingExprs.end())
1147    return nullptr;
1148
1149  auto &Candidates = Pos->second;
1150  // Because we process the basic blocks in pre-order of the dominator tree, a
1151  // candidate that doesn't dominate the current instruction won't dominate any
1152  // future instruction either. Therefore, we pop it out of the stack. This
1153  // optimization makes the algorithm O(n).
1154  while (!Candidates.empty()) {
1155    Instruction *Candidate = Candidates.back();
1156    if (DT->dominates(Candidate, Dominatee))
1157      return Candidate;
1158    Candidates.pop_back();
1159  }
1160  return nullptr;
1161}
1162
1163bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
1164  if (!SE->isSCEVable(I->getType()))
1165    return false;
1166
1167  //   Dom: LHS+RHS
1168  //   I: sext(LHS)+sext(RHS)
1169  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
1170  // TODO: handle zext
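  //
  // Illustrative IR (names invented):
  //   %dom = add nsw i32 %a, %b        ; recorded earlier, dominates %i
  //   %sa  = sext i32 %a to i64
  //   %sb  = sext i32 %b to i64
  //   %i   = add i64 %sa, %sb
  // is rewritten so that %i becomes "sext i32 %dom to i64".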
1171  Value *LHS = nullptr, *RHS = nullptr;
1172  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
1173      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
1174    if (LHS->getType() == RHS->getType()) {
1175      const SCEV *Key =
1176          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
1177      if (auto *Dom = findClosestMatchingDominator(Key, I)) {
1178        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
1179        NewSExt->takeName(I);
1180        I->replaceAllUsesWith(NewSExt);
1181        RecursivelyDeleteTriviallyDeadInstructions(I);
1182        return true;
1183      }
1184    }
1185  }
1186
1187  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
1188  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
1189      match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
1190    if (programUndefinedIfFullPoison(I)) {
1191      const SCEV *Key =
1192          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
1193      DominatingExprs[Key].push_back(I);
1194    }
1195  }
1196  return false;
1197}
1198
1199bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
1200  bool Changed = false;
1201  DominatingExprs.clear();
1202  for (const auto Node : depth_first(DT)) {
1203    BasicBlock *BB = Node->getBlock();
1204    for (auto I = BB->begin(); I != BB->end(); ) {
1205      Instruction *Cur = &*I++;
1206      Changed |= reuniteExts(Cur);
1207    }
1208  }
1209  return Changed;
1210}
1211
1212void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
1213  for (BasicBlock &B : F) {
1214    for (Instruction &I : B) {
1215      if (isInstructionTriviallyDead(&I)) {
1216        std::string ErrMessage;
1217        raw_string_ostream RSO(ErrMessage);
1218        RSO << "Dead instruction detected!\n" << I << "\n";
1219        llvm_unreachable(RSO.str().c_str());
1220      }
1221    }
1222  }
1223}
1224
1225bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
1226    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
1227  if (!FirstGEP || !FirstGEP->hasOneUse())
1228    return false;
1229
1230  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
1231    return false;
1232
1233  if (FirstGEP == SecondGEP)
1234    return false;
1235
1236  unsigned FirstNum = FirstGEP->getNumOperands();
1237  unsigned SecondNum = SecondGEP->getNumOperands();
  // Give up if the number of operands is not 2.
1239  if (FirstNum != SecondNum || FirstNum != 2)
1240    return false;
1241
1242  Value *FirstBase = FirstGEP->getOperand(0);
1243  Value *SecondBase = SecondGEP->getOperand(0);
1244  Value *FirstOffset = FirstGEP->getOperand(1);
1245  // Give up if the index of the first GEP is loop invariant.
1246  if (CurLoop->isLoopInvariant(FirstOffset))
1247    return false;
1248
  // Give up if the bases don't have the same type.
1250  if (FirstBase->getType() != SecondBase->getType())
1251    return false;
1252
1253  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
1254
  // Check whether the second operand of the first GEP has a constant
  // coefficient. For example, in the following code we won't gain anything by
  // hoisting the second GEP out, because the second GEP can be folded away.
1258  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
1259  //   %67 = shl i64 %scevgep.sum.ur159, 2
1260  //   %uglygep160 = getelementptr i8* %65, i64 %67
1261  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
1262
  // Skip a constant shift instruction, which may be generated when splitting
  // GEPs.
1264  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
1265      isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
1266    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
1267
  // Give up if FirstOffsetDef is an Add or Sub with a constant, because
  // hoisting may not be profitable at all due to constant folding.
1270  if (FirstOffsetDef)
1271    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
1272      unsigned opc = BO->getOpcode();
1273      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
1274          (isa<ConstantInt>(BO->getOperand(0)) ||
1275           isa<ConstantInt>(BO->getOperand(1))))
1276        return false;
1277    }
1278  return true;
1279}
1280
1281bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
1282  int UsesInLoop = 0;
1283  for (User *U : V->users()) {
1284    if (Instruction *User = dyn_cast<Instruction>(U))
1285      if (L->contains(User))
1286        if (++UsesInLoop > 1)
1287          return true;
1288  }
1289  return false;
1290}
1291
1292void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
1293                                                GetElementPtrInst *Second) {
1294  Value *Offset1 = First->getOperand(1);
1295  Value *Offset2 = Second->getOperand(1);
1296  First->setOperand(1, Offset2);
1297  Second->setOperand(1, Offset1);
1298
  // We changed p+o+c to p+c+o; p+c may no longer be in bounds.
1300  const DataLayout &DAL = First->getModule()->getDataLayout();
1301  APInt Offset(DAL.getIndexSizeInBits(
1302                   cast<PointerType>(First->getType())->getAddressSpace()),
1303               0);
1304  Value *NewBase =
1305      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
1306  uint64_t ObjectSize;
1307  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
1308     Offset.ugt(ObjectSize)) {
1309    First->setIsInBounds(false);
1310    Second->setIsInBounds(false);
1311  } else
1312    First->setIsInBounds(true);
1313}
1314