//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
///  E.g. A group of interleaving access loads (Factor = 2; accessing every
///       other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p 'Ind', and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // Check the supported combinations of element size, factor and wide-access
  // size (see the list above).
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}
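
// Illustrative examples of groups isSupported() accepts (assuming AVX and
// address space 0): a stride-3 load of a <96 x i8> wide vector (768 bits)
// whose user shuffles produce <32 x i8> results, or a stride-4 store whose
// <64 x i8> wide shuffle (512 bits) interleaves four <16 x i8> vectors.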

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the data in
  // the following way:
  // [0,1,...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
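  // Worked example (illustrative, not from the original comments): a stride-3
  // wide load of <96 x i8> has VecLength == 768, so VecBaseTy becomes
  // <16 x i8> and NumLoads == 3 * (768 / 384) == 6, i.e. the wide load is
  // split into six consecutive 128-bit loads below.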
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}

// Changes the scale of the vector type: halves the number of elements and
// doubles the scalar size.
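// For instance (illustrative), scaleVectorType(MVT::v32i8) yields MVT::v16i16.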
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a shuffle mask that blends two vectors. This
// function only works on instructions whose lanes fit within 256-bit
// registers. From the input mask 'Mask' it creates a new mask 'Out' by
// adding an offset to each element: 'LowOffset' is added to the elements
// taken from the first vector and 'HighOffset' to the elements taken from
// the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load, we must consider the
// element order as above.
// This function combines two kinds of shuffles: a per-lane shuffle and a
// "blend" shuffle. By computing the shuffle on a sequence of 16 elements
// (one lane) and adding the correct offset, we create a shuffle + blend
// sequence between two shuffles.
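// Hypothetical example (not from the original comments): with a <32 x i8>
// type (NumOfElm == 32), Mask == {0, 1, 2}, LowOffset == 0 and
// HighOffset == 16, the resulting Out mask is {0, 1, 2, 48, 49, 50}: the
// first half indexes the first vector, the second half indexes the second
// vector shifted by HighOffset.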
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (int I : Mask)
    Out.push_back(I + LowOffset);
  for (int I : Mask)
    Out.push_back(I + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original state, and is de facto
// the opposite of the function concatSubVector.

// For VecElems = 16
// Invec[0] -  |0|      TransposedMatrix[0] - |0|
// Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] -  |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskLow and MaskHigh are built as the vpunpcklbw and vpunpckhbw X86
  // shuffle patterns.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // MaskLowTemp and MaskHighTemp are built as the vpunpckldw and vpunpckhdw
  // X86 shuffle patterns.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
                   4, Builder);
}

//  createShuffleStride returns a shuffle mask of size N.
//  The shuffle pattern is as follows:
//  {0, Stride%(VF/Lane), (2*Stride)%(VF/Lane),...,(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
//  where Lane is the # of lanes in a register:
//  VectorSize = 128 => Lane = 1
//  VectorSize = 256 => Lane = 2
//  For example, the shuffle pattern for VF 16 with register size 256
//  (-> lanes = 2) is: {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

//  setGroupSize sets 'SizeInfo' to the sizes (in number of elements) of the
//  groups inside a shuffle mask. A mask contains exactly 3 groups, where
//  each group is a monotonically increasing sequence with stride 3.
//  For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
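//  Another worked example (illustrative): for a 256-bit <32 x i8> type each
//  lane holds VF = 16 elements, so the group sizes come out as {6, 5, 5}
//  (ceil(16/3), ceil(14/3), ceil(15/3)).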
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}

//  DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
//  vpalignr works per lane,
//  where Lane is the # of lanes in a register:
//  VectorWide = 128 => Lane = 1
//  VectorWide = 256 => Lane = 2
//  For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
//  For Lane = 2 the shuffle pattern is:
//  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
//  The Imm argument sets the offset amount. The result of the function is
//  stored in the ShuffleMask vector and is built as described above.
//  AlignDirection is a boolean that indicates the direction of the alignment
//  (false - align to the "right" side, true - align to the "left" side).
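//  Hypothetical example (not from the original comments): for a 128-bit
//  <16 x i8> type with Imm = 5, AlignDirection = true and Unary = true the
//  resulting mask is {5,6,...,15,0,1,2,3,4}; with Unary = false the tail
//  indices come from the second source instead: {5,6,...,15,16,17,18,19,20}.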
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // if i+offset is out of this lane then we actually need the other source
      // If Unary the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - The function rebuilds the data into the expected order.
// An assumption about the shape of the matrix is made so that the
// deinterleave can work with lane instructions such as 'vpalignr'. This
// function ensures that the data is built in the correct shape for those
// lane instructions. Each lane inside the vector is 128 bits wide.
//
// The 'InVec' argument contains the data in increasing order. InVec[0] holds
// the first 128 bits of data. The number of different lanes inside a vector
// depends on 'VecElems'. In general, the formula is
// VecElems * type / 128. The number of entries in 'InVec' depends on
// 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = ArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = ArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = ArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}

// Lower interleaved load(s) into target specific instructions/
// intrinsics. The lowering sequence varies depending on the vector-types,
// factor, number of shuffles and ISA.
// Currently, lowering is supported for 4-element vectors of 64 bits with
// Factor = 4 and for 8-bit element vectors with Factor = 3 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}