InterleavedLoadCombinePass.cpp revision 360784
1343171Sdim//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===// 2343171Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6343171Sdim// 7343171Sdim//===----------------------------------------------------------------------===// 8343171Sdim// 9343171Sdim// \file 10343171Sdim// 11343171Sdim// This file defines the interleaved-load-combine pass. The pass searches for 12343171Sdim// ShuffleVectorInstruction that execute interleaving loads. If a matching 13343171Sdim// pattern is found, it adds a combined load and further instructions in a 14343171Sdim// pattern that is detectable by InterleavedAccesPass. The old instructions are 15343171Sdim// left dead to be removed later. The pass is specifically designed to be 16343171Sdim// executed just before InterleavedAccesPass to find any left-over instances 17343171Sdim// that are not detected within former passes. 18343171Sdim// 19343171Sdim//===----------------------------------------------------------------------===// 20343171Sdim 21343171Sdim#include "llvm/ADT/Statistic.h" 22343171Sdim#include "llvm/Analysis/MemoryLocation.h" 23343171Sdim#include "llvm/Analysis/MemorySSA.h" 24343171Sdim#include "llvm/Analysis/MemorySSAUpdater.h" 25343171Sdim#include "llvm/Analysis/OptimizationRemarkEmitter.h" 26343171Sdim#include "llvm/Analysis/TargetTransformInfo.h" 27343171Sdim#include "llvm/CodeGen/Passes.h" 28343171Sdim#include "llvm/CodeGen/TargetLowering.h" 29343171Sdim#include "llvm/CodeGen/TargetPassConfig.h" 30343171Sdim#include "llvm/CodeGen/TargetSubtargetInfo.h" 31343171Sdim#include "llvm/IR/DataLayout.h" 32343171Sdim#include "llvm/IR/Dominators.h" 33343171Sdim#include "llvm/IR/Function.h" 34343171Sdim#include "llvm/IR/Instructions.h" 35343171Sdim#include "llvm/IR/LegacyPassManager.h" 36343171Sdim#include "llvm/IR/Module.h" 37360784Sdim#include "llvm/InitializePasses.h" 38343171Sdim#include "llvm/Pass.h" 39343171Sdim#include "llvm/Support/Debug.h" 40343171Sdim#include "llvm/Support/ErrorHandling.h" 41343171Sdim#include "llvm/Support/raw_ostream.h" 42343171Sdim#include "llvm/Target/TargetMachine.h" 43343171Sdim 44343171Sdim#include <algorithm> 45343171Sdim#include <cassert> 46343171Sdim#include <list> 47343171Sdim 48343171Sdimusing namespace llvm; 49343171Sdim 50343171Sdim#define DEBUG_TYPE "interleaved-load-combine" 51343171Sdim 52343171Sdimnamespace { 53343171Sdim 54343171Sdim/// Statistic counter 55343171SdimSTATISTIC(NumInterleavedLoadCombine, "Number of combined loads"); 56343171Sdim 57343171Sdim/// Option to disable the pass 58343171Sdimstatic cl::opt<bool> DisableInterleavedLoadCombine( 59343171Sdim "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden, 60343171Sdim cl::desc("Disable combining of interleaved loads")); 61343171Sdim 62343171Sdimstruct VectorInfo; 63343171Sdim 64343171Sdimstruct InterleavedLoadCombineImpl { 65343171Sdimpublic: 66343171Sdim InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA, 67343171Sdim TargetMachine &TM) 68343171Sdim : F(F), DT(DT), MSSA(MSSA), 69343171Sdim TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), 70343171Sdim TTI(TM.getTargetTransformInfo(F)) {} 71343171Sdim 72343171Sdim /// Scan the function for interleaved load candidates and execute the 73343171Sdim /// replacement if applicable. 74343171Sdim bool run(); 75343171Sdim 76343171Sdimprivate: 77343171Sdim /// Function this pass is working on 78343171Sdim Function &F; 79343171Sdim 80343171Sdim /// Dominator Tree Analysis 81343171Sdim DominatorTree &DT; 82343171Sdim 83343171Sdim /// Memory Alias Analyses 84343171Sdim MemorySSA &MSSA; 85343171Sdim 86343171Sdim /// Target Lowering Information 87343171Sdim const TargetLowering &TLI; 88343171Sdim 89343171Sdim /// Target Transform Information 90343171Sdim const TargetTransformInfo TTI; 91343171Sdim 92343171Sdim /// Find the instruction in sets LIs that dominates all others, return nullptr 93343171Sdim /// if there is none. 94343171Sdim LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs); 95343171Sdim 96343171Sdim /// Replace interleaved load candidates. It does additional 97343171Sdim /// analyses if this makes sense. Returns true on success and false 98343171Sdim /// of nothing has been changed. 99343171Sdim bool combine(std::list<VectorInfo> &InterleavedLoad, 100343171Sdim OptimizationRemarkEmitter &ORE); 101343171Sdim 102343171Sdim /// Given a set of VectorInfo containing candidates for a given interleave 103343171Sdim /// factor, find a set that represents a 'factor' interleaved load. 104343171Sdim bool findPattern(std::list<VectorInfo> &Candidates, 105343171Sdim std::list<VectorInfo> &InterleavedLoad, unsigned Factor, 106343171Sdim const DataLayout &DL); 107343171Sdim}; // InterleavedLoadCombine 108343171Sdim 109343171Sdim/// First Order Polynomial on an n-Bit Integer Value 110343171Sdim/// 111343171Sdim/// Polynomial(Value) = Value * B + A + E*2^(n-e) 112343171Sdim/// 113343171Sdim/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most 114343171Sdim/// significant bits. It is introduced if an exact computation cannot be proven 115343171Sdim/// (e.q. division by 2). 116343171Sdim/// 117343171Sdim/// As part of this optimization multiple loads will be combined. It necessary 118343171Sdim/// to prove that loads are within some relative offset to each other. This 119343171Sdim/// class is used to prove relative offsets of values loaded from memory. 120343171Sdim/// 121343171Sdim/// Representing an integer in this form is sound since addition in two's 122343171Sdim/// complement is associative (trivial) and multiplication distributes over the 123343171Sdim/// addition (see Proof(1) in Polynomial::mul). Further, both operations 124343171Sdim/// commute. 125343171Sdim// 126343171Sdim// Example: 127343171Sdim// declare @fn(i64 %IDX, <4 x float>* %PTR) { 128343171Sdim// %Pa1 = add i64 %IDX, 2 129343171Sdim// %Pa2 = lshr i64 %Pa1, 1 130343171Sdim// %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2 131343171Sdim// %Va = load <4 x float>, <4 x float>* %Pa3 132343171Sdim// 133343171Sdim// %Pb1 = add i64 %IDX, 4 134343171Sdim// %Pb2 = lshr i64 %Pb1, 1 135343171Sdim// %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2 136343171Sdim// %Vb = load <4 x float>, <4 x float>* %Pb3 137343171Sdim// ... } 138343171Sdim// 139343171Sdim// The goal is to prove that two loads load consecutive addresses. 140343171Sdim// 141343171Sdim// In this case the polynomials are constructed by the following 142343171Sdim// steps. 143343171Sdim// 144343171Sdim// The number tag #e specifies the error bits. 145343171Sdim// 146343171Sdim// Pa_0 = %IDX #0 147343171Sdim// Pa_1 = %IDX + 2 #0 | add 2 148343171Sdim// Pa_2 = %IDX/2 + 1 #1 | lshr 1 149343171Sdim// Pa_3 = %IDX/2 + 1 #1 | GEP, step signext to i64 150343171Sdim// Pa_4 = (%IDX/2)*16 + 16 #0 | GEP, multiply index by sizeof(4) for floats 151343171Sdim// Pa_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components 152343171Sdim// 153343171Sdim// Pb_0 = %IDX #0 154343171Sdim// Pb_1 = %IDX + 4 #0 | add 2 155343171Sdim// Pb_2 = %IDX/2 + 2 #1 | lshr 1 156343171Sdim// Pb_3 = %IDX/2 + 2 #1 | GEP, step signext to i64 157343171Sdim// Pb_4 = (%IDX/2)*16 + 32 #0 | GEP, multiply index by sizeof(4) for floats 158343171Sdim// Pb_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components 159343171Sdim// 160343171Sdim// Pb_5 - Pa_5 = 16 #0 | subtract to get the offset 161343171Sdim// 162343171Sdim// Remark: %PTR is not maintained within this class. So in this instance the 163343171Sdim// offset of 16 can only be assumed if the pointers are equal. 164343171Sdim// 165343171Sdimclass Polynomial { 166343171Sdim /// Operations on B 167343171Sdim enum BOps { 168343171Sdim LShr, 169343171Sdim Mul, 170343171Sdim SExt, 171343171Sdim Trunc, 172343171Sdim }; 173343171Sdim 174343171Sdim /// Number of Error Bits e 175343171Sdim unsigned ErrorMSBs; 176343171Sdim 177343171Sdim /// Value 178343171Sdim Value *V; 179343171Sdim 180343171Sdim /// Coefficient B 181343171Sdim SmallVector<std::pair<BOps, APInt>, 4> B; 182343171Sdim 183343171Sdim /// Coefficient A 184343171Sdim APInt A; 185343171Sdim 186343171Sdimpublic: 187343171Sdim Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() { 188343171Sdim IntegerType *Ty = dyn_cast<IntegerType>(V->getType()); 189343171Sdim if (Ty) { 190343171Sdim ErrorMSBs = 0; 191343171Sdim this->V = V; 192343171Sdim A = APInt(Ty->getBitWidth(), 0); 193343171Sdim } 194343171Sdim } 195343171Sdim 196343171Sdim Polynomial(const APInt &A, unsigned ErrorMSBs = 0) 197343171Sdim : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {} 198343171Sdim 199343171Sdim Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0) 200343171Sdim : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {} 201343171Sdim 202343171Sdim Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {} 203343171Sdim 204343171Sdim /// Increment and clamp the number of undefined bits. 205343171Sdim void incErrorMSBs(unsigned amt) { 206343171Sdim if (ErrorMSBs == (unsigned)-1) 207343171Sdim return; 208343171Sdim 209343171Sdim ErrorMSBs += amt; 210343171Sdim if (ErrorMSBs > A.getBitWidth()) 211343171Sdim ErrorMSBs = A.getBitWidth(); 212343171Sdim } 213343171Sdim 214343171Sdim /// Decrement and clamp the number of undefined bits. 215343171Sdim void decErrorMSBs(unsigned amt) { 216343171Sdim if (ErrorMSBs == (unsigned)-1) 217343171Sdim return; 218343171Sdim 219343171Sdim if (ErrorMSBs > amt) 220343171Sdim ErrorMSBs -= amt; 221343171Sdim else 222343171Sdim ErrorMSBs = 0; 223343171Sdim } 224343171Sdim 225343171Sdim /// Apply an add on the polynomial 226343171Sdim Polynomial &add(const APInt &C) { 227343171Sdim // Note: Addition is associative in two's complement even when in case of 228343171Sdim // signed overflow. 229343171Sdim // 230343171Sdim // Error bits can only propagate into higher significant bits. As these are 231343171Sdim // already regarded as undefined, there is no change. 232343171Sdim // 233343171Sdim // Theorem: Adding a constant to a polynomial does not change the error 234343171Sdim // term. 235343171Sdim // 236343171Sdim // Proof: 237343171Sdim // 238343171Sdim // Since the addition is associative and commutes: 239343171Sdim // 240343171Sdim // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e) 241343171Sdim // [qed] 242343171Sdim 243343171Sdim if (C.getBitWidth() != A.getBitWidth()) { 244343171Sdim ErrorMSBs = (unsigned)-1; 245343171Sdim return *this; 246343171Sdim } 247343171Sdim 248343171Sdim A += C; 249343171Sdim return *this; 250343171Sdim } 251343171Sdim 252343171Sdim /// Apply a multiplication onto the polynomial. 253343171Sdim Polynomial &mul(const APInt &C) { 254343171Sdim // Note: Multiplication distributes over the addition 255343171Sdim // 256343171Sdim // Theorem: Multiplication distributes over the addition 257343171Sdim // 258343171Sdim // Proof(1): 259343171Sdim // 260343171Sdim // (B+A)*C =- 261343171Sdim // = (B + A) + (B + A) + .. {C Times} 262343171Sdim // addition is associative and commutes, hence 263343171Sdim // = B + B + .. {C Times} .. + A + A + .. {C times} 264343171Sdim // = B*C + A*C 265343171Sdim // (see (function add) for signed values and overflows) 266343171Sdim // [qed] 267343171Sdim // 268343171Sdim // Theorem: If C has c trailing zeros, errors bits in A or B are shifted out 269343171Sdim // to the left. 270343171Sdim // 271343171Sdim // Proof(2): 272343171Sdim // 273343171Sdim // Let B' and A' be the n-Bit inputs with some unknown errors EA, 274343171Sdim // EB at e leading bits. B' and A' can be written down as: 275343171Sdim // 276343171Sdim // B' = B + 2^(n-e)*EB 277343171Sdim // A' = A + 2^(n-e)*EA 278343171Sdim // 279343171Sdim // Let C' be an input with c trailing zero bits. C' can be written as 280343171Sdim // 281343171Sdim // C' = C*2^c 282343171Sdim // 283343171Sdim // Therefore we can compute the result by using distributivity and 284343171Sdim // commutativity. 285343171Sdim // 286343171Sdim // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' = 287343171Sdim // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' = 288343171Sdim // = (B'+A') * C' = 289343171Sdim // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' = 290343171Sdim // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' = 291343171Sdim // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' = 292343171Sdim // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c = 293343171Sdim // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c = 294343171Sdim // 295343171Sdim // Let EC be the final error with EC = C*(EB + EA) 296343171Sdim // 297343171Sdim // = (B + A)*C' + EC*2^(n-e)*2^c = 298343171Sdim // = (B + A)*C' + EC*2^(n-(e-c)) 299343171Sdim // 300343171Sdim // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c 301343171Sdim // less error bits than the input. c bits are shifted out to the left. 302343171Sdim // [qed] 303343171Sdim 304343171Sdim if (C.getBitWidth() != A.getBitWidth()) { 305343171Sdim ErrorMSBs = (unsigned)-1; 306343171Sdim return *this; 307343171Sdim } 308343171Sdim 309343171Sdim // Multiplying by one is a no-op. 310343171Sdim if (C.isOneValue()) { 311343171Sdim return *this; 312343171Sdim } 313343171Sdim 314343171Sdim // Multiplying by zero removes the coefficient B and defines all bits. 315343171Sdim if (C.isNullValue()) { 316343171Sdim ErrorMSBs = 0; 317343171Sdim deleteB(); 318343171Sdim } 319343171Sdim 320343171Sdim // See Proof(2): Trailing zero bits indicate a left shift. This removes 321343171Sdim // leading bits from the result even if they are undefined. 322343171Sdim decErrorMSBs(C.countTrailingZeros()); 323343171Sdim 324343171Sdim A *= C; 325343171Sdim pushBOperation(Mul, C); 326343171Sdim return *this; 327343171Sdim } 328343171Sdim 329343171Sdim /// Apply a logical shift right on the polynomial 330343171Sdim Polynomial &lshr(const APInt &C) { 331343171Sdim // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e') 332343171Sdim // where 333343171Sdim // e' = e + 1, 334343171Sdim // E is a e-bit number, 335343171Sdim // E' is a e'-bit number, 336343171Sdim // holds under the following precondition: 337343171Sdim // pre(1): A % 2 = 0 338343171Sdim // pre(2): e < n, (see Theorem(2) for the trivial case with e=n) 339343171Sdim // where >> expresses a logical shift to the right, with adding zeros. 340343171Sdim // 341343171Sdim // We need to show that for every, E there is a E' 342343171Sdim // 343343171Sdim // B = b_h * 2^(n-1) + b_m * 2 + b_l 344343171Sdim // A = a_h * 2^(n-1) + a_m * 2 (pre(1)) 345343171Sdim // 346343171Sdim // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers 347343171Sdim // 348343171Sdim // Let X = (B + A + E*2^(n-e)) >> 1 349343171Sdim // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1 350343171Sdim // 351343171Sdim // X = [B + A + E*2^(n-e)] >> 1 = 352343171Sdim // = [ b_h * 2^(n-1) + b_m * 2 + b_l + 353343171Sdim // + a_h * 2^(n-1) + a_m * 2 + 354343171Sdim // + E * 2^(n-e) ] >> 1 = 355343171Sdim // 356343171Sdim // The sum is built by putting the overflow of [a_m + b+n] into the term 357343171Sdim // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within 358343171Sdim // this bit is discarded. This is expressed by % 2. 359343171Sdim // 360343171Sdim // The bit in position 0 cannot overflow into the term (b_m + a_m). 361343171Sdim // 362343171Sdim // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) + 363343171Sdim // + ((b_m + a_m) % 2^(n-2)) * 2 + 364343171Sdim // + b_l + E * 2^(n-e) ] >> 1 = 365343171Sdim // 366343171Sdim // The shift is computed by dividing the terms by 2 and by cutting off 367343171Sdim // b_l. 368343171Sdim // 369343171Sdim // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 370343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 371343171Sdim // + E * 2^(n-(e+1)) = 372343171Sdim // 373343171Sdim // by the definition in the Theorem e+1 = e' 374343171Sdim // 375343171Sdim // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 376343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 377343171Sdim // + E * 2^(n-e') = 378343171Sdim // 379343171Sdim // Compute Y by applying distributivity first 380343171Sdim // 381343171Sdim // Y = (B >> 1) + (A >> 1) + E*2^(n-e') = 382343171Sdim // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 + 383343171Sdim // + (a_h * 2^(n-1) + a_m * 2) >> 1 + 384343171Sdim // + E * 2^(n-e) >> 1 = 385343171Sdim // 386343171Sdim // Again, the shift is computed by dividing the terms by 2 and by cutting 387343171Sdim // off b_l. 388343171Sdim // 389343171Sdim // = b_h * 2^(n-2) + b_m + 390343171Sdim // + a_h * 2^(n-2) + a_m + 391343171Sdim // + E * 2^(n-(e+1)) = 392343171Sdim // 393343171Sdim // Again, the sum is built by putting the overflow of [a_m + b+n] into 394343171Sdim // the term 2^(n-1). But this time there is room for a second bit in the 395343171Sdim // term 2^(n-2) we add this bit to a new term and denote it o_h in a 396343171Sdim // second step. 397343171Sdim // 398343171Sdim // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) + 399343171Sdim // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 400343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 401343171Sdim // + E * 2^(n-(e+1)) = 402343171Sdim // 403343171Sdim // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1 404343171Sdim // Further replace e+1 by e'. 405343171Sdim // 406343171Sdim // = o_h * 2^(n-1) + 407343171Sdim // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 408343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 409343171Sdim // + E * 2^(n-e') = 410343171Sdim // 411343171Sdim // Move o_h into the error term and construct E'. To ensure that there is 412343171Sdim // no 2^x with negative x, this step requires pre(2) (e < n). 413343171Sdim // 414343171Sdim // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 415343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 416343171Sdim // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1) 417343171Sdim // | out of the old exponent 418343171Sdim // + E * 2^(n-e') = 419343171Sdim // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 420343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 421343171Sdim // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of 422343171Sdim // | the old exponent 423343171Sdim // 424343171Sdim // Let E' = o_h * 2^(e'-1) + E 425343171Sdim // 426343171Sdim // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + 427343171Sdim // + ((b_m + a_m) % 2^(n-2)) + 428343171Sdim // + E' * 2^(n-e') 429343171Sdim // 430343171Sdim // Because X and Y are distinct only in there error terms and E' can be 431343171Sdim // constructed as shown the theorem holds. 432343171Sdim // [qed] 433343171Sdim // 434343171Sdim // For completeness in case of the case e=n it is also required to show that 435343171Sdim // distributivity can be applied. 436343171Sdim // 437343171Sdim // In this case Theorem(1) transforms to (the pre-condition on A can also be 438343171Sdim // dropped) 439343171Sdim // 440343171Sdim // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E' 441343171Sdim // where 442343171Sdim // A, B, E, E' are two's complement numbers with the same bit 443343171Sdim // width 444343171Sdim // 445343171Sdim // Let A + B + E = X 446343171Sdim // Let (B >> 1) + (A >> 1) = Y 447343171Sdim // 448343171Sdim // Therefore we need to show that for every X and Y there is an E' which 449343171Sdim // makes the equation 450343171Sdim // 451343171Sdim // X = Y + E' 452343171Sdim // 453343171Sdim // hold. This is trivially the case for E' = X - Y. 454343171Sdim // 455343171Sdim // [qed] 456343171Sdim // 457343171Sdim // Remark: Distributing lshr with and arbitrary number n can be expressed as 458343171Sdim // ((((B + A) lshr 1) lshr 1) ... ) {n times}. 459343171Sdim // This construction induces n additional error bits at the left. 460343171Sdim 461343171Sdim if (C.getBitWidth() != A.getBitWidth()) { 462343171Sdim ErrorMSBs = (unsigned)-1; 463343171Sdim return *this; 464343171Sdim } 465343171Sdim 466343171Sdim if (C.isNullValue()) 467343171Sdim return *this; 468343171Sdim 469343171Sdim // Test if the result will be zero 470343171Sdim unsigned shiftAmt = C.getZExtValue(); 471343171Sdim if (shiftAmt >= C.getBitWidth()) 472343171Sdim return mul(APInt(C.getBitWidth(), 0)); 473343171Sdim 474343171Sdim // The proof that shiftAmt LSBs are zero for at least one summand is only 475343171Sdim // possible for the constant number. 476343171Sdim // 477343171Sdim // If this can be proven add shiftAmt to the error counter 478343171Sdim // `ErrorMSBs`. Otherwise set all bits as undefined. 479343171Sdim if (A.countTrailingZeros() < shiftAmt) 480343171Sdim ErrorMSBs = A.getBitWidth(); 481343171Sdim else 482343171Sdim incErrorMSBs(shiftAmt); 483343171Sdim 484343171Sdim // Apply the operation. 485343171Sdim pushBOperation(LShr, C); 486343171Sdim A = A.lshr(shiftAmt); 487343171Sdim 488343171Sdim return *this; 489343171Sdim } 490343171Sdim 491343171Sdim /// Apply a sign-extend or truncate operation on the polynomial. 492343171Sdim Polynomial &sextOrTrunc(unsigned n) { 493343171Sdim if (n < A.getBitWidth()) { 494343171Sdim // Truncate: Clearly undefined Bits on the MSB side are removed 495343171Sdim // if there are any. 496343171Sdim decErrorMSBs(A.getBitWidth() - n); 497343171Sdim A = A.trunc(n); 498343171Sdim pushBOperation(Trunc, APInt(sizeof(n) * 8, n)); 499343171Sdim } 500343171Sdim if (n > A.getBitWidth()) { 501343171Sdim // Extend: Clearly extending first and adding later is different 502343171Sdim // to adding first and extending later in all extended bits. 503343171Sdim incErrorMSBs(n - A.getBitWidth()); 504343171Sdim A = A.sext(n); 505343171Sdim pushBOperation(SExt, APInt(sizeof(n) * 8, n)); 506343171Sdim } 507343171Sdim 508343171Sdim return *this; 509343171Sdim } 510343171Sdim 511343171Sdim /// Test if there is a coefficient B. 512343171Sdim bool isFirstOrder() const { return V != nullptr; } 513343171Sdim 514343171Sdim /// Test coefficient B of two Polynomials are equal. 515343171Sdim bool isCompatibleTo(const Polynomial &o) const { 516343171Sdim // The polynomial use different bit width. 517343171Sdim if (A.getBitWidth() != o.A.getBitWidth()) 518343171Sdim return false; 519343171Sdim 520343171Sdim // If neither Polynomial has the Coefficient B. 521343171Sdim if (!isFirstOrder() && !o.isFirstOrder()) 522343171Sdim return true; 523343171Sdim 524343171Sdim // The index variable is different. 525343171Sdim if (V != o.V) 526343171Sdim return false; 527343171Sdim 528343171Sdim // Check the operations. 529343171Sdim if (B.size() != o.B.size()) 530343171Sdim return false; 531343171Sdim 532343171Sdim auto ob = o.B.begin(); 533343171Sdim for (auto &b : B) { 534343171Sdim if (b != *ob) 535343171Sdim return false; 536343171Sdim ob++; 537343171Sdim } 538343171Sdim 539343171Sdim return true; 540343171Sdim } 541343171Sdim 542343171Sdim /// Subtract two polynomials, return an undefined polynomial if 543343171Sdim /// subtraction is not possible. 544343171Sdim Polynomial operator-(const Polynomial &o) const { 545343171Sdim // Return an undefined polynomial if incompatible. 546343171Sdim if (!isCompatibleTo(o)) 547343171Sdim return Polynomial(); 548343171Sdim 549343171Sdim // If the polynomials are compatible (meaning they have the same 550343171Sdim // coefficient on B), B is eliminated. Thus a polynomial solely 551343171Sdim // containing A is returned 552343171Sdim return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs)); 553343171Sdim } 554343171Sdim 555343171Sdim /// Subtract a constant from a polynomial, 556343171Sdim Polynomial operator-(uint64_t C) const { 557343171Sdim Polynomial Result(*this); 558343171Sdim Result.A -= C; 559343171Sdim return Result; 560343171Sdim } 561343171Sdim 562343171Sdim /// Add a constant to a polynomial, 563343171Sdim Polynomial operator+(uint64_t C) const { 564343171Sdim Polynomial Result(*this); 565343171Sdim Result.A += C; 566343171Sdim return Result; 567343171Sdim } 568343171Sdim 569343171Sdim /// Returns true if it can be proven that two Polynomials are equal. 570343171Sdim bool isProvenEqualTo(const Polynomial &o) { 571343171Sdim // Subtract both polynomials and test if it is fully defined and zero. 572343171Sdim Polynomial r = *this - o; 573343171Sdim return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue()); 574343171Sdim } 575343171Sdim 576343171Sdim /// Print the polynomial into a stream. 577343171Sdim void print(raw_ostream &OS) const { 578343171Sdim OS << "[{#ErrBits:" << ErrorMSBs << "} "; 579343171Sdim 580343171Sdim if (V) { 581343171Sdim for (auto b : B) 582343171Sdim OS << "("; 583343171Sdim OS << "(" << *V << ") "; 584343171Sdim 585343171Sdim for (auto b : B) { 586343171Sdim switch (b.first) { 587343171Sdim case LShr: 588343171Sdim OS << "LShr "; 589343171Sdim break; 590343171Sdim case Mul: 591343171Sdim OS << "Mul "; 592343171Sdim break; 593343171Sdim case SExt: 594343171Sdim OS << "SExt "; 595343171Sdim break; 596343171Sdim case Trunc: 597343171Sdim OS << "Trunc "; 598343171Sdim break; 599343171Sdim } 600343171Sdim 601343171Sdim OS << b.second << ") "; 602343171Sdim } 603343171Sdim } 604343171Sdim 605343171Sdim OS << "+ " << A << "]"; 606343171Sdim } 607343171Sdim 608343171Sdimprivate: 609343171Sdim void deleteB() { 610343171Sdim V = nullptr; 611343171Sdim B.clear(); 612343171Sdim } 613343171Sdim 614343171Sdim void pushBOperation(const BOps Op, const APInt &C) { 615343171Sdim if (isFirstOrder()) { 616343171Sdim B.push_back(std::make_pair(Op, C)); 617343171Sdim return; 618343171Sdim } 619343171Sdim } 620343171Sdim}; 621343171Sdim 622343171Sdim#ifndef NDEBUG 623343171Sdimstatic raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) { 624343171Sdim S.print(OS); 625343171Sdim return OS; 626343171Sdim} 627343171Sdim#endif 628343171Sdim 629343171Sdim/// VectorInfo stores abstract the following information for each vector 630343171Sdim/// element: 631343171Sdim/// 632343171Sdim/// 1) The the memory address loaded into the element as Polynomial 633343171Sdim/// 2) a set of load instruction necessary to construct the vector, 634343171Sdim/// 3) a set of all other instructions that are necessary to create the vector and 635343171Sdim/// 4) a pointer value that can be used as relative base for all elements. 636343171Sdimstruct VectorInfo { 637343171Sdimprivate: 638343171Sdim VectorInfo(const VectorInfo &c) : VTy(c.VTy) { 639343171Sdim llvm_unreachable( 640343171Sdim "Copying VectorInfo is neither implemented nor necessary,"); 641343171Sdim } 642343171Sdim 643343171Sdimpublic: 644343171Sdim /// Information of a Vector Element 645343171Sdim struct ElementInfo { 646343171Sdim /// Offset Polynomial. 647343171Sdim Polynomial Ofs; 648343171Sdim 649343171Sdim /// The Load Instruction used to Load the entry. LI is null if the pointer 650343171Sdim /// of the load instruction does not point on to the entry 651343171Sdim LoadInst *LI; 652343171Sdim 653343171Sdim ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr) 654343171Sdim : Ofs(Offset), LI(LI) {} 655343171Sdim }; 656343171Sdim 657343171Sdim /// Basic-block the load instructions are within 658343171Sdim BasicBlock *BB; 659343171Sdim 660343171Sdim /// Pointer value of all participation load instructions 661343171Sdim Value *PV; 662343171Sdim 663343171Sdim /// Participating load instructions 664343171Sdim std::set<LoadInst *> LIs; 665343171Sdim 666343171Sdim /// Participating instructions 667343171Sdim std::set<Instruction *> Is; 668343171Sdim 669343171Sdim /// Final shuffle-vector instruction 670343171Sdim ShuffleVectorInst *SVI; 671343171Sdim 672343171Sdim /// Information of the offset for each vector element 673343171Sdim ElementInfo *EI; 674343171Sdim 675343171Sdim /// Vector Type 676343171Sdim VectorType *const VTy; 677343171Sdim 678343171Sdim VectorInfo(VectorType *VTy) 679343171Sdim : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) { 680343171Sdim EI = new ElementInfo[VTy->getNumElements()]; 681343171Sdim } 682343171Sdim 683343171Sdim virtual ~VectorInfo() { delete[] EI; } 684343171Sdim 685343171Sdim unsigned getDimension() const { return VTy->getNumElements(); } 686343171Sdim 687343171Sdim /// Test if the VectorInfo can be part of an interleaved load with the 688343171Sdim /// specified factor. 689343171Sdim /// 690343171Sdim /// \param Factor of the interleave 691343171Sdim /// \param DL Targets Datalayout 692343171Sdim /// 693343171Sdim /// \returns true if this is possible and false if not 694343171Sdim bool isInterleaved(unsigned Factor, const DataLayout &DL) const { 695343171Sdim unsigned Size = DL.getTypeAllocSize(VTy->getElementType()); 696343171Sdim for (unsigned i = 1; i < getDimension(); i++) { 697343171Sdim if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) { 698343171Sdim return false; 699343171Sdim } 700343171Sdim } 701343171Sdim return true; 702343171Sdim } 703343171Sdim 704343171Sdim /// Recursively computes the vector information stored in V. 705343171Sdim /// 706343171Sdim /// This function delegates the work to specialized implementations 707343171Sdim /// 708343171Sdim /// \param V Value to operate on 709343171Sdim /// \param Result Result of the computation 710343171Sdim /// 711343171Sdim /// \returns false if no sensible information can be gathered. 712343171Sdim static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) { 713343171Sdim ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V); 714343171Sdim if (SVI) 715343171Sdim return computeFromSVI(SVI, Result, DL); 716343171Sdim LoadInst *LI = dyn_cast<LoadInst>(V); 717343171Sdim if (LI) 718343171Sdim return computeFromLI(LI, Result, DL); 719343171Sdim BitCastInst *BCI = dyn_cast<BitCastInst>(V); 720343171Sdim if (BCI) 721343171Sdim return computeFromBCI(BCI, Result, DL); 722343171Sdim return false; 723343171Sdim } 724343171Sdim 725343171Sdim /// BitCastInst specialization to compute the vector information. 726343171Sdim /// 727343171Sdim /// \param BCI BitCastInst to operate on 728343171Sdim /// \param Result Result of the computation 729343171Sdim /// 730343171Sdim /// \returns false if no sensible information can be gathered. 731343171Sdim static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result, 732343171Sdim const DataLayout &DL) { 733343171Sdim Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0)); 734343171Sdim 735343171Sdim if (!Op) 736343171Sdim return false; 737343171Sdim 738343171Sdim VectorType *VTy = dyn_cast<VectorType>(Op->getType()); 739343171Sdim if (!VTy) 740343171Sdim return false; 741343171Sdim 742343171Sdim // We can only cast from large to smaller vectors 743343171Sdim if (Result.VTy->getNumElements() % VTy->getNumElements()) 744343171Sdim return false; 745343171Sdim 746343171Sdim unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements(); 747343171Sdim unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType()); 748343171Sdim unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType()); 749343171Sdim 750343171Sdim if (NewSize * Factor != OldSize) 751343171Sdim return false; 752343171Sdim 753343171Sdim VectorInfo Old(VTy); 754343171Sdim if (!compute(Op, Old, DL)) 755343171Sdim return false; 756343171Sdim 757343171Sdim for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) { 758343171Sdim for (unsigned j = 0; j < Factor; j++) { 759343171Sdim Result.EI[i + j] = 760343171Sdim ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize, 761343171Sdim j == 0 ? Old.EI[i / Factor].LI : nullptr); 762343171Sdim } 763343171Sdim } 764343171Sdim 765343171Sdim Result.BB = Old.BB; 766343171Sdim Result.PV = Old.PV; 767343171Sdim Result.LIs.insert(Old.LIs.begin(), Old.LIs.end()); 768343171Sdim Result.Is.insert(Old.Is.begin(), Old.Is.end()); 769343171Sdim Result.Is.insert(BCI); 770343171Sdim Result.SVI = nullptr; 771343171Sdim 772343171Sdim return true; 773343171Sdim } 774343171Sdim 775343171Sdim /// ShuffleVectorInst specialization to compute vector information. 776343171Sdim /// 777343171Sdim /// \param SVI ShuffleVectorInst to operate on 778343171Sdim /// \param Result Result of the computation 779343171Sdim /// 780343171Sdim /// Compute the left and the right side vector information and merge them by 781343171Sdim /// applying the shuffle operation. This function also ensures that the left 782343171Sdim /// and right side have compatible loads. This means that all loads are with 783343171Sdim /// in the same basic block and are based on the same pointer. 784343171Sdim /// 785343171Sdim /// \returns false if no sensible information can be gathered. 786343171Sdim static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result, 787343171Sdim const DataLayout &DL) { 788343171Sdim VectorType *ArgTy = dyn_cast<VectorType>(SVI->getOperand(0)->getType()); 789343171Sdim assert(ArgTy && "ShuffleVector Operand is not a VectorType"); 790343171Sdim 791343171Sdim // Compute the left hand vector information. 792343171Sdim VectorInfo LHS(ArgTy); 793343171Sdim if (!compute(SVI->getOperand(0), LHS, DL)) 794343171Sdim LHS.BB = nullptr; 795343171Sdim 796343171Sdim // Compute the right hand vector information. 797343171Sdim VectorInfo RHS(ArgTy); 798343171Sdim if (!compute(SVI->getOperand(1), RHS, DL)) 799343171Sdim RHS.BB = nullptr; 800343171Sdim 801343171Sdim // Neither operand produced sensible results? 802343171Sdim if (!LHS.BB && !RHS.BB) 803343171Sdim return false; 804343171Sdim // Only RHS produced sensible results? 805343171Sdim else if (!LHS.BB) { 806343171Sdim Result.BB = RHS.BB; 807343171Sdim Result.PV = RHS.PV; 808343171Sdim } 809343171Sdim // Only LHS produced sensible results? 810343171Sdim else if (!RHS.BB) { 811343171Sdim Result.BB = LHS.BB; 812343171Sdim Result.PV = LHS.PV; 813343171Sdim } 814343171Sdim // Both operands produced sensible results? 815343171Sdim else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) { 816343171Sdim Result.BB = LHS.BB; 817343171Sdim Result.PV = LHS.PV; 818343171Sdim } 819343171Sdim // Both operands produced sensible results but they are incompatible. 820343171Sdim else { 821343171Sdim return false; 822343171Sdim } 823343171Sdim 824343171Sdim // Merge and apply the operation on the offset information. 825343171Sdim if (LHS.BB) { 826343171Sdim Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end()); 827343171Sdim Result.Is.insert(LHS.Is.begin(), LHS.Is.end()); 828343171Sdim } 829343171Sdim if (RHS.BB) { 830343171Sdim Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end()); 831343171Sdim Result.Is.insert(RHS.Is.begin(), RHS.Is.end()); 832343171Sdim } 833343171Sdim Result.Is.insert(SVI); 834343171Sdim Result.SVI = SVI; 835343171Sdim 836343171Sdim int j = 0; 837343171Sdim for (int i : SVI->getShuffleMask()) { 838343171Sdim assert((i < 2 * (signed)ArgTy->getNumElements()) && 839343171Sdim "Invalid ShuffleVectorInst (index out of bounds)"); 840343171Sdim 841343171Sdim if (i < 0) 842343171Sdim Result.EI[j] = ElementInfo(); 843343171Sdim else if (i < (signed)ArgTy->getNumElements()) { 844343171Sdim if (LHS.BB) 845343171Sdim Result.EI[j] = LHS.EI[i]; 846343171Sdim else 847343171Sdim Result.EI[j] = ElementInfo(); 848343171Sdim } else { 849343171Sdim if (RHS.BB) 850343171Sdim Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()]; 851343171Sdim else 852343171Sdim Result.EI[j] = ElementInfo(); 853343171Sdim } 854343171Sdim j++; 855343171Sdim } 856343171Sdim 857343171Sdim return true; 858343171Sdim } 859343171Sdim 860343171Sdim /// LoadInst specialization to compute vector information. 861343171Sdim /// 862343171Sdim /// This function also acts as abort condition to the recursion. 863343171Sdim /// 864343171Sdim /// \param LI LoadInst to operate on 865343171Sdim /// \param Result Result of the computation 866343171Sdim /// 867343171Sdim /// \returns false if no sensible information can be gathered. 868343171Sdim static bool computeFromLI(LoadInst *LI, VectorInfo &Result, 869343171Sdim const DataLayout &DL) { 870343171Sdim Value *BasePtr; 871343171Sdim Polynomial Offset; 872343171Sdim 873343171Sdim if (LI->isVolatile()) 874343171Sdim return false; 875343171Sdim 876343171Sdim if (LI->isAtomic()) 877343171Sdim return false; 878343171Sdim 879343171Sdim // Get the base polynomial 880343171Sdim computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL); 881343171Sdim 882343171Sdim Result.BB = LI->getParent(); 883343171Sdim Result.PV = BasePtr; 884343171Sdim Result.LIs.insert(LI); 885343171Sdim Result.Is.insert(LI); 886343171Sdim 887343171Sdim for (unsigned i = 0; i < Result.getDimension(); i++) { 888343171Sdim Value *Idx[2] = { 889343171Sdim ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0), 890343171Sdim ConstantInt::get(Type::getInt32Ty(LI->getContext()), i), 891343171Sdim }; 892343171Sdim int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2)); 893343171Sdim Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr); 894343171Sdim } 895343171Sdim 896343171Sdim return true; 897343171Sdim } 898343171Sdim 899343171Sdim /// Recursively compute polynomial of a value. 900343171Sdim /// 901343171Sdim /// \param BO Input binary operation 902343171Sdim /// \param Result Result polynomial 903343171Sdim static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) { 904343171Sdim Value *LHS = BO.getOperand(0); 905343171Sdim Value *RHS = BO.getOperand(1); 906343171Sdim 907343171Sdim // Find the RHS Constant if any 908343171Sdim ConstantInt *C = dyn_cast<ConstantInt>(RHS); 909343171Sdim if ((!C) && BO.isCommutative()) { 910343171Sdim C = dyn_cast<ConstantInt>(LHS); 911343171Sdim if (C) 912343171Sdim std::swap(LHS, RHS); 913343171Sdim } 914343171Sdim 915343171Sdim switch (BO.getOpcode()) { 916343171Sdim case Instruction::Add: 917343171Sdim if (!C) 918343171Sdim break; 919343171Sdim 920343171Sdim computePolynomial(*LHS, Result); 921343171Sdim Result.add(C->getValue()); 922343171Sdim return; 923343171Sdim 924343171Sdim case Instruction::LShr: 925343171Sdim if (!C) 926343171Sdim break; 927343171Sdim 928343171Sdim computePolynomial(*LHS, Result); 929343171Sdim Result.lshr(C->getValue()); 930343171Sdim return; 931343171Sdim 932343171Sdim default: 933343171Sdim break; 934343171Sdim } 935343171Sdim 936343171Sdim Result = Polynomial(&BO); 937343171Sdim } 938343171Sdim 939343171Sdim /// Recursively compute polynomial of a value 940343171Sdim /// 941343171Sdim /// \param V input value 942343171Sdim /// \param Result result polynomial 943343171Sdim static void computePolynomial(Value &V, Polynomial &Result) { 944360784Sdim if (auto *BO = dyn_cast<BinaryOperator>(&V)) 945360784Sdim computePolynomialBinOp(*BO, Result); 946343171Sdim else 947343171Sdim Result = Polynomial(&V); 948343171Sdim } 949343171Sdim 950343171Sdim /// Compute the Polynomial representation of a Pointer type. 951343171Sdim /// 952343171Sdim /// \param Ptr input pointer value 953343171Sdim /// \param Result result polynomial 954343171Sdim /// \param BasePtr pointer the polynomial is based on 955343171Sdim /// \param DL Datalayout of the target machine 956343171Sdim static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result, 957343171Sdim Value *&BasePtr, 958343171Sdim const DataLayout &DL) { 959343171Sdim // Not a pointer type? Return an undefined polynomial 960343171Sdim PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType()); 961343171Sdim if (!PtrTy) { 962343171Sdim Result = Polynomial(); 963343171Sdim BasePtr = nullptr; 964353358Sdim return; 965343171Sdim } 966343171Sdim unsigned PointerBits = 967343171Sdim DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()); 968343171Sdim 969343171Sdim /// Skip pointer casts. Return Zero polynomial otherwise 970343171Sdim if (isa<CastInst>(&Ptr)) { 971343171Sdim CastInst &CI = *cast<CastInst>(&Ptr); 972343171Sdim switch (CI.getOpcode()) { 973343171Sdim case Instruction::BitCast: 974343171Sdim computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL); 975343171Sdim break; 976343171Sdim default: 977343171Sdim BasePtr = &Ptr; 978343171Sdim Polynomial(PointerBits, 0); 979343171Sdim break; 980343171Sdim } 981343171Sdim } 982343171Sdim /// Resolve GetElementPtrInst. 983343171Sdim else if (isa<GetElementPtrInst>(&Ptr)) { 984343171Sdim GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr); 985343171Sdim 986343171Sdim APInt BaseOffset(PointerBits, 0); 987343171Sdim 988343171Sdim // Check if we can compute the Offset with accumulateConstantOffset 989343171Sdim if (GEP.accumulateConstantOffset(DL, BaseOffset)) { 990343171Sdim Result = Polynomial(BaseOffset); 991343171Sdim BasePtr = GEP.getPointerOperand(); 992343171Sdim return; 993343171Sdim } else { 994343171Sdim // Otherwise we allow that the last index operand of the GEP is 995343171Sdim // non-constant. 996343171Sdim unsigned idxOperand, e; 997343171Sdim SmallVector<Value *, 4> Indices; 998343171Sdim for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e; 999343171Sdim idxOperand++) { 1000343171Sdim ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand)); 1001343171Sdim if (!IDX) 1002343171Sdim break; 1003343171Sdim Indices.push_back(IDX); 1004343171Sdim } 1005343171Sdim 1006343171Sdim // It must also be the last operand. 1007343171Sdim if (idxOperand + 1 != e) { 1008343171Sdim Result = Polynomial(); 1009343171Sdim BasePtr = nullptr; 1010343171Sdim return; 1011343171Sdim } 1012343171Sdim 1013343171Sdim // Compute the polynomial of the index operand. 1014343171Sdim computePolynomial(*GEP.getOperand(idxOperand), Result); 1015343171Sdim 1016343171Sdim // Compute base offset from zero based index, excluding the last 1017343171Sdim // variable operand. 1018343171Sdim BaseOffset = 1019343171Sdim DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices); 1020343171Sdim 1021343171Sdim // Apply the operations of GEP to the polynomial. 1022343171Sdim unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType()); 1023343171Sdim Result.sextOrTrunc(PointerBits); 1024343171Sdim Result.mul(APInt(PointerBits, ResultSize)); 1025343171Sdim Result.add(BaseOffset); 1026343171Sdim BasePtr = GEP.getPointerOperand(); 1027343171Sdim } 1028343171Sdim } 1029343171Sdim // All other instructions are handled by using the value as base pointer and 1030343171Sdim // a zero polynomial. 1031343171Sdim else { 1032343171Sdim BasePtr = &Ptr; 1033343171Sdim Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0); 1034343171Sdim } 1035343171Sdim } 1036343171Sdim 1037343171Sdim#ifndef NDEBUG 1038343171Sdim void print(raw_ostream &OS) const { 1039343171Sdim if (PV) 1040343171Sdim OS << *PV; 1041343171Sdim else 1042343171Sdim OS << "(none)"; 1043343171Sdim OS << " + "; 1044343171Sdim for (unsigned i = 0; i < getDimension(); i++) 1045343171Sdim OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs; 1046343171Sdim OS << "]"; 1047343171Sdim } 1048343171Sdim#endif 1049343171Sdim}; 1050343171Sdim 1051343171Sdim} // anonymous namespace 1052343171Sdim 1053343171Sdimbool InterleavedLoadCombineImpl::findPattern( 1054343171Sdim std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad, 1055343171Sdim unsigned Factor, const DataLayout &DL) { 1056343171Sdim for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) { 1057343171Sdim unsigned i; 1058343171Sdim // Try to find an interleaved load using the front of Worklist as first line 1059343171Sdim unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType()); 1060343171Sdim 1061343171Sdim // List containing iterators pointing to the VectorInfos of the candidates 1062343171Sdim std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end()); 1063343171Sdim 1064343171Sdim for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) { 1065343171Sdim if (C->VTy != C0->VTy) 1066343171Sdim continue; 1067343171Sdim if (C->BB != C0->BB) 1068343171Sdim continue; 1069343171Sdim if (C->PV != C0->PV) 1070343171Sdim continue; 1071343171Sdim 1072343171Sdim // Check the current value matches any of factor - 1 remaining lines 1073343171Sdim for (i = 1; i < Factor; i++) { 1074343171Sdim if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) { 1075343171Sdim Res[i] = C; 1076343171Sdim } 1077343171Sdim } 1078343171Sdim 1079343171Sdim for (i = 1; i < Factor; i++) { 1080343171Sdim if (Res[i] == Candidates.end()) 1081343171Sdim break; 1082343171Sdim } 1083343171Sdim if (i == Factor) { 1084343171Sdim Res[0] = C0; 1085343171Sdim break; 1086343171Sdim } 1087343171Sdim } 1088343171Sdim 1089343171Sdim if (Res[0] != Candidates.end()) { 1090343171Sdim // Move the result into the output 1091343171Sdim for (unsigned i = 0; i < Factor; i++) { 1092343171Sdim InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]); 1093343171Sdim } 1094343171Sdim 1095343171Sdim return true; 1096343171Sdim } 1097343171Sdim } 1098343171Sdim return false; 1099343171Sdim} 1100343171Sdim 1101343171SdimLoadInst * 1102343171SdimInterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) { 1103343171Sdim assert(!LIs.empty() && "No load instructions given."); 1104343171Sdim 1105343171Sdim // All LIs are within the same BB. Select the first for a reference. 1106343171Sdim BasicBlock *BB = (*LIs.begin())->getParent(); 1107343171Sdim BasicBlock::iterator FLI = 1108343171Sdim std::find_if(BB->begin(), BB->end(), [&LIs](Instruction &I) -> bool { 1109343171Sdim return is_contained(LIs, &I); 1110343171Sdim }); 1111343171Sdim assert(FLI != BB->end()); 1112343171Sdim 1113343171Sdim return cast<LoadInst>(FLI); 1114343171Sdim} 1115343171Sdim 1116343171Sdimbool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad, 1117343171Sdim OptimizationRemarkEmitter &ORE) { 1118343171Sdim LLVM_DEBUG(dbgs() << "Checking interleaved load\n"); 1119343171Sdim 1120343171Sdim // The insertion point is the LoadInst which loads the first values. The 1121343171Sdim // following tests are used to proof that the combined load can be inserted 1122343171Sdim // just before InsertionPoint. 1123343171Sdim LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI; 1124343171Sdim 1125343171Sdim // Test if the offset is computed 1126343171Sdim if (!InsertionPoint) 1127343171Sdim return false; 1128343171Sdim 1129343171Sdim std::set<LoadInst *> LIs; 1130343171Sdim std::set<Instruction *> Is; 1131343171Sdim std::set<Instruction *> SVIs; 1132343171Sdim 1133343171Sdim unsigned InterleavedCost; 1134343171Sdim unsigned InstructionCost = 0; 1135343171Sdim 1136343171Sdim // Get the interleave factor 1137343171Sdim unsigned Factor = InterleavedLoad.size(); 1138343171Sdim 1139343171Sdim // Merge all input sets used in analysis 1140343171Sdim for (auto &VI : InterleavedLoad) { 1141343171Sdim // Generate a set of all load instructions to be combined 1142343171Sdim LIs.insert(VI.LIs.begin(), VI.LIs.end()); 1143343171Sdim 1144343171Sdim // Generate a set of all instructions taking part in load 1145343171Sdim // interleaved. This list excludes the instructions necessary for the 1146343171Sdim // polynomial construction. 1147343171Sdim Is.insert(VI.Is.begin(), VI.Is.end()); 1148343171Sdim 1149343171Sdim // Generate the set of the final ShuffleVectorInst. 1150343171Sdim SVIs.insert(VI.SVI); 1151343171Sdim } 1152343171Sdim 1153343171Sdim // There is nothing to combine. 1154343171Sdim if (LIs.size() < 2) 1155343171Sdim return false; 1156343171Sdim 1157343171Sdim // Test if all participating instruction will be dead after the 1158343171Sdim // transformation. If intermediate results are used, no performance gain can 1159343171Sdim // be expected. Also sum the cost of the Instructions beeing left dead. 1160343171Sdim for (auto &I : Is) { 1161343171Sdim // Compute the old cost 1162343171Sdim InstructionCost += 1163343171Sdim TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency); 1164343171Sdim 1165343171Sdim // The final SVIs are allowed not to be dead, all uses will be replaced 1166343171Sdim if (SVIs.find(I) != SVIs.end()) 1167343171Sdim continue; 1168343171Sdim 1169343171Sdim // If there are users outside the set to be eliminated, we abort the 1170343171Sdim // transformation. No gain can be expected. 1171360784Sdim for (auto *U : I->users()) { 1172343171Sdim if (Is.find(dyn_cast<Instruction>(U)) == Is.end()) 1173343171Sdim return false; 1174343171Sdim } 1175343171Sdim } 1176343171Sdim 1177343171Sdim // We know that all LoadInst are within the same BB. This guarantees that 1178343171Sdim // either everything or nothing is loaded. 1179343171Sdim LoadInst *First = findFirstLoad(LIs); 1180343171Sdim 1181343171Sdim // To be safe that the loads can be combined, iterate over all loads and test 1182343171Sdim // that the corresponding defining access dominates first LI. This guarantees 1183343171Sdim // that there are no aliasing stores in between the loads. 1184343171Sdim auto FMA = MSSA.getMemoryAccess(First); 1185343171Sdim for (auto LI : LIs) { 1186343171Sdim auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess(); 1187343171Sdim if (!MSSA.dominates(MADef, FMA)) 1188343171Sdim return false; 1189343171Sdim } 1190343171Sdim assert(!LIs.empty() && "There are no LoadInst to combine"); 1191343171Sdim 1192343171Sdim // It is necessary that insertion point dominates all final ShuffleVectorInst. 1193343171Sdim for (auto &VI : InterleavedLoad) { 1194343171Sdim if (!DT.dominates(InsertionPoint, VI.SVI)) 1195343171Sdim return false; 1196343171Sdim } 1197343171Sdim 1198343171Sdim // All checks are done. Add instructions detectable by InterleavedAccessPass 1199343171Sdim // The old instruction will are left dead. 1200343171Sdim IRBuilder<> Builder(InsertionPoint); 1201343171Sdim Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType(); 1202343171Sdim unsigned ElementsPerSVI = 1203343171Sdim InterleavedLoad.front().SVI->getType()->getNumElements(); 1204343171Sdim VectorType *ILTy = VectorType::get(ETy, Factor * ElementsPerSVI); 1205343171Sdim 1206343171Sdim SmallVector<unsigned, 4> Indices; 1207343171Sdim for (unsigned i = 0; i < Factor; i++) 1208343171Sdim Indices.push_back(i); 1209343171Sdim InterleavedCost = TTI.getInterleavedMemoryOpCost( 1210343171Sdim Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(), 1211343171Sdim InsertionPoint->getPointerAddressSpace()); 1212343171Sdim 1213343171Sdim if (InterleavedCost >= InstructionCost) { 1214343171Sdim return false; 1215343171Sdim } 1216343171Sdim 1217343171Sdim // Create a pointer cast for the wide load. 1218343171Sdim auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0), 1219343171Sdim ILTy->getPointerTo(), 1220343171Sdim "interleaved.wide.ptrcast"); 1221343171Sdim 1222343171Sdim // Create the wide load and update the MemorySSA. 1223353358Sdim auto LI = Builder.CreateAlignedLoad(ILTy, CI, InsertionPoint->getAlignment(), 1224343171Sdim "interleaved.wide.load"); 1225343171Sdim auto MSSAU = MemorySSAUpdater(&MSSA); 1226343171Sdim MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore( 1227343171Sdim LI, nullptr, MSSA.getMemoryAccess(InsertionPoint))); 1228343171Sdim MSSAU.insertUse(MSSALoad); 1229343171Sdim 1230343171Sdim // Create the final SVIs and replace all uses. 1231343171Sdim int i = 0; 1232343171Sdim for (auto &VI : InterleavedLoad) { 1233343171Sdim SmallVector<uint32_t, 4> Mask; 1234343171Sdim for (unsigned j = 0; j < ElementsPerSVI; j++) 1235343171Sdim Mask.push_back(i + j * Factor); 1236343171Sdim 1237343171Sdim Builder.SetInsertPoint(VI.SVI); 1238343171Sdim auto SVI = Builder.CreateShuffleVector(LI, UndefValue::get(LI->getType()), 1239343171Sdim Mask, "interleaved.shuffle"); 1240343171Sdim VI.SVI->replaceAllUsesWith(SVI); 1241343171Sdim i++; 1242343171Sdim } 1243343171Sdim 1244343171Sdim NumInterleavedLoadCombine++; 1245343171Sdim ORE.emit([&]() { 1246343171Sdim return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI) 1247343171Sdim << "Load interleaved combined with factor " 1248343171Sdim << ore::NV("Factor", Factor); 1249343171Sdim }); 1250343171Sdim 1251343171Sdim return true; 1252343171Sdim} 1253343171Sdim 1254343171Sdimbool InterleavedLoadCombineImpl::run() { 1255343171Sdim OptimizationRemarkEmitter ORE(&F); 1256343171Sdim bool changed = false; 1257343171Sdim unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor(); 1258343171Sdim 1259343171Sdim auto &DL = F.getParent()->getDataLayout(); 1260343171Sdim 1261343171Sdim // Start with the highest factor to avoid combining and recombining. 1262343171Sdim for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) { 1263343171Sdim std::list<VectorInfo> Candidates; 1264343171Sdim 1265343171Sdim for (BasicBlock &BB : F) { 1266343171Sdim for (Instruction &I : BB) { 1267343171Sdim if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) { 1268343171Sdim 1269343171Sdim Candidates.emplace_back(SVI->getType()); 1270343171Sdim 1271343171Sdim if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) { 1272343171Sdim Candidates.pop_back(); 1273343171Sdim continue; 1274343171Sdim } 1275343171Sdim 1276343171Sdim if (!Candidates.back().isInterleaved(Factor, DL)) { 1277343171Sdim Candidates.pop_back(); 1278343171Sdim } 1279343171Sdim } 1280343171Sdim } 1281343171Sdim } 1282343171Sdim 1283343171Sdim std::list<VectorInfo> InterleavedLoad; 1284343171Sdim while (findPattern(Candidates, InterleavedLoad, Factor, DL)) { 1285343171Sdim if (combine(InterleavedLoad, ORE)) { 1286343171Sdim changed = true; 1287343171Sdim } else { 1288343171Sdim // Remove the first element of the Interleaved Load but put the others 1289343171Sdim // back on the list and continue searching 1290343171Sdim Candidates.splice(Candidates.begin(), InterleavedLoad, 1291343171Sdim std::next(InterleavedLoad.begin()), 1292343171Sdim InterleavedLoad.end()); 1293343171Sdim } 1294343171Sdim InterleavedLoad.clear(); 1295343171Sdim } 1296343171Sdim } 1297343171Sdim 1298343171Sdim return changed; 1299343171Sdim} 1300343171Sdim 1301343171Sdimnamespace { 1302343171Sdim/// This pass combines interleaved loads into a pattern detectable by 1303343171Sdim/// InterleavedAccessPass. 1304343171Sdimstruct InterleavedLoadCombine : public FunctionPass { 1305343171Sdim static char ID; 1306343171Sdim 1307343171Sdim InterleavedLoadCombine() : FunctionPass(ID) { 1308343171Sdim initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry()); 1309343171Sdim } 1310343171Sdim 1311343171Sdim StringRef getPassName() const override { 1312343171Sdim return "Interleaved Load Combine Pass"; 1313343171Sdim } 1314343171Sdim 1315343171Sdim bool runOnFunction(Function &F) override { 1316343171Sdim if (DisableInterleavedLoadCombine) 1317343171Sdim return false; 1318343171Sdim 1319343171Sdim auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 1320343171Sdim if (!TPC) 1321343171Sdim return false; 1322343171Sdim 1323343171Sdim LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() 1324343171Sdim << "\n"); 1325343171Sdim 1326343171Sdim return InterleavedLoadCombineImpl( 1327343171Sdim F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), 1328343171Sdim getAnalysis<MemorySSAWrapperPass>().getMSSA(), 1329343171Sdim TPC->getTM<TargetMachine>()) 1330343171Sdim .run(); 1331343171Sdim } 1332343171Sdim 1333343171Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 1334343171Sdim AU.addRequired<MemorySSAWrapperPass>(); 1335343171Sdim AU.addRequired<DominatorTreeWrapperPass>(); 1336343171Sdim FunctionPass::getAnalysisUsage(AU); 1337343171Sdim } 1338343171Sdim 1339343171Sdimprivate: 1340343171Sdim}; 1341343171Sdim} // anonymous namespace 1342343171Sdim 1343343171Sdimchar InterleavedLoadCombine::ID = 0; 1344343171Sdim 1345343171SdimINITIALIZE_PASS_BEGIN( 1346343171Sdim InterleavedLoadCombine, DEBUG_TYPE, 1347343171Sdim "Combine interleaved loads into wide loads and shufflevector instructions", 1348343171Sdim false, false) 1349343171SdimINITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1350343171SdimINITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) 1351343171SdimINITIALIZE_PASS_END( 1352343171Sdim InterleavedLoadCombine, DEBUG_TYPE, 1353343171Sdim "Combine interleaved loads into wide loads and shufflevector instructions", 1354343171Sdim false, false) 1355343171Sdim 1356343171SdimFunctionPass * 1357343171Sdimllvm::createInterleavedLoadCombinePass() { 1358343171Sdim auto P = new InterleavedLoadCombine(); 1359343171Sdim return P; 1360343171Sdim} 1361