//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
///  E.g. A group of interleaving access loads (Factor = 2; accessing every
///       other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
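///  E.g. The store-side counterpart (Factor = 2 shown here purely as an
///       illustration): the narrow vectors are re-interleaved by one wide
///       shuffle that feeds a single wide store
///        %i.vec = shuffle <4 x i32> %v0, <4 x i32> %v1, <0, 4, 1, 5, 2, 6, 3, 7>
///        store <8 x i32> %i.vec, <8 x i32>* %ptr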
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p 'Ind', and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;
  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
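  // For example (illustrative), a stride-3 byte load is a <96 x i8> wide load
  // (WideInstSize == 768) consumed by three <32 x i8> shuffles with starting
  // indices 0, 1 and 2.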
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffles that represent stride 4 for 64-bit and byte element
  // types, and stride 3 for byte element types, with the wide-instruction
  // sizes checked below.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy, *VecBasePtrTy;
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, we load the
  // information in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
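  // For example (illustrative), a <96 x i8> source (VecLength == 768) with
  // NumSubVectors == 3 is decomposed into six <16 x i8> loads at consecutive
  // 128-bit offsets.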
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  }
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
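// For example, v32i8 is rescaled to v16i16, and v16i8 to v8i16.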
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a shuffle mask from two source vectors. This
// function only works on instructions whose lanes fit inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by adding an
// offset to every element. The offset amount depends on the two integers
// 'LowOffset' and 'HighOffset', where 'LowOffset' applies to the first vector
// and 'HighOffset' applies to the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load, we must consider the
// element order as above. In this function we combine two types of shuffles:
// the first is a per-lane shuffle (vpshufb-like) and the second is a "blend"
// shuffle. By computing the shuffle on a sequence of 16 elements (one lane)
// and adding the correct offset, we create a per-lane shuffle + blend
// sequence between two shuffles.
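// For example (an illustrative sketch): with VT = v32i8 (NumOfElm = 32),
// Mask = {0, 3, 6}, LowOffset = 0 and HighOffset = 16, 'Out' becomes
// {0, 3, 6, 48, 51, 54}: the first copy of the mask is shifted by LowOffset
// and the second copy by HighOffset + NumOfElm.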
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original order; it is effectively
// the inverse of the function concatSubVector.

// For VecElems = 16
// Invec[0] -  |0|      TransposedMatrix[0] - |0|
// Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] -  |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskLow and MaskHigh follow the vpunpcklbw and vpunpckhbw X86
  // shuffle pattern.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // MaskLowTemp and MaskHighTemp follow the vpunpckldw and vpunpckhdw X86
  // shuffle pattern.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}

//  createShuffleStride returns a shuffle mask of size N.
//  The shuffle pattern is as follows:
//  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
//  Where Lane is the number of lanes in a register:
//  VectorSize = 128 => Lane = 1
//  VectorSize = 256 => Lane = 2
//  For example, the shuffle pattern for VF 16 with register size 256
//  (lanes = 2) is {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
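//  For VF 16 with register size 128 (lanes = 1) and Stride = 3, the single
//  lane covers all 16 elements:
//  {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}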
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

//  setGroupSize sets 'SizeInfo' to the sizes (number of elements) of the
//  groups inside a shuffle mask. A mask contains exactly 3 groups, where
//  each group is a monotonically increasing sequence with stride 3.
//  For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}

//  DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
//  vpalignr operates per lane,
//  where Lane is the # of lanes in a register:
//  VectorWide = 128 => Lane = 1
//  VectorWide = 256 => Lane = 2
//  For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
//  For Lane = 2 the shuffle pattern is:
//  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
//  The Imm variable sets the offset amount. The result of the function is
//  stored in the ShuffleMask vector, built as described at the beginning of
//  this comment. AlignDirection is a boolean that indicates the direction of
//  the alignment (false - align to the "right" side, true - align to the
//  "left" side).
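//  For example (illustrative), VT = v16i8 with Imm = 3, AlignDirection = true
//  and Unary = false yields {3,4,...,15,16,17,18}, i.e. the last 13 bytes of
//  the first source followed by the first 3 bytes of the second source.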
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // if i+offset is out of this lane then we actually need the other source
      // If Unary the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - Rebuilds the data into the expected order. An assumption
// about the shape of the matrix was made so that the deinterleave can work
// with per-lane instructions such as 'vpalignr' or 'vpshufb'. This function
// ensures that the data is laid out correctly for those lane instructions.
// Each lane inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order. InVec[0] holds
// the first 128 bits of data. The number of lanes inside a vector depends on
// 'VecElems'; in general, the formula is VecElems * ElemSize / 128. The size
// of the 'InVec' array likewise depends on 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the strided shuffle back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13},
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}

// Lower interleaved load(s) into target specific instructions/intrinsics.
// The lowering sequence varies depending on the vector types, factor,
// number of shuffles and ISA.
// Currently, load lowering is supported for 4x64-bit vectors with Factor = 4
// and for 8-bit element vectors with Factor = 3 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}