//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
///  E.g. A group of interleaving access loads (Factor = 2; accessing every
///       other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
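///       And similarly a group of interleaving access stores (Factor = 4;
///       here %v holds four concatenated 4-element sub-vectors a, b, c, d):
///        %i.vec = shuffle <16 x i32> %v, <16 x i32> undef,
///                 <0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>
///        store <16 x i32> %i.vec, <16 x i32>* %ptr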
class X86InterleavedAccessGroup {
  /// Reference to the wide load or store instruction of an interleaved
  /// access group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide load or store instruction \p 'I', a group of
  /// interleaved-vectors \p Shuffs, a reference to the first indices of each
  /// interleaved-vector \p 'Ind' and the interleaving stride factor \p F. In
  /// order to generate X86-specific instructions/intrinsics it also requires
  /// the underlying target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
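  // E.g. a stride-4 group of <4 x i64> shuffles corresponds to a
  // 4 * 4 * 64 = 1024-bit wide instruction, which is the WideInstSize
  // accepted below.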
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // Stride-4 interleaving of 64-bit elements is only supported when the wide
  // instruction is 1024 bits.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy, *VecBasePtrTy;
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, the information is
  // loaded in the following lane order:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
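  // E.g. a 768-bit wide load (VF = 32) is broken into
  // 3 * (768 / 384) = 6 loads of <16 x i8>, one per 128-bit lane.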
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  }
  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
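// E.g. v32i8 is rescaled to v16i16 and v16i8 to v8i16.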
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

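// The identity sequence 0..63; prefixes of it serve as concatenation shuffle
// masks (e.g. makeArrayRef(Concat, 32) concatenates two 16-element vectors).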
static uint32_t Concat[] = {
  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };

// genShuffleBland - Creates a shuffle mask that blends two vectors. This
// function only works on instructions whose lanes lie inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by adding an
// offset to each element. The offset amount depends on the two integers
// 'LowOffset' and 'HighOffset', where 'LowOffset' applies to elements taken
// from the first vector and 'HighOffset' to elements taken from the second
// vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror of the load, we must consider the
// element order as above. This function combines two kinds of shuffles: an
// in-lane shuffle ('vpshufb'-style) and a "blend" shuffle. By computing the
// shuffle on a sequence of 16 elements (one lane) and adding the correct
// offset, we create a shuffle + blend sequence between two shuffles.
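// E.g. with NumOfElm = 32, LowOffset = 0 and HighOffset = 16, element i of
// 'Out' is Mask[i] in the first copy of the mask and Mask[i] + 48
// (= 16 + 32, i.e. selecting from the second source) in the second copy.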
static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original state, and is de facto
// the inverse of concatSubVector.

// For VecElems = 16
// Invec[0] -  |0|      TransposedMatrix[0] - |0|
// Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] -  |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  SmallVector<uint32_t, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  }

  for (unsigned i = 0; i < Stride; i++)
    TransposedMatrix[i] =
        Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> LowHighMask[2];
  SmallVector<uint32_t, 32> MaskHighTemp;
  SmallVector<uint32_t, 32> MaskLowTemp;

  // MaskHigh and MaskLow are built following the vpunpckhbw and vpunpcklbw
  // X86 shuffle pattern.

  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);

  // MaskHighTemp and MaskLowTemp are built following the vpunpckhdw and
  // vpunpckldw X86 shuffle pattern.

  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}

//  createShuffleStride returns a shuffle mask of size N.
//  The shuffle pattern is as follows:
//  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
//  Where Lane is the # of lanes in a register:
//  VectorSize = 128 => Lane = 1
//  VectorSize = 256 => Lane = 2
//  For example, the shuffle pattern for VF 16 with register size 256
//  (-> lanes = 2) is:
//  {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<uint32_t> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

//  setGroupSize sets 'SizeInfo' to the sizes (number of elements) of the
//  groups inside a shuffle mask. A mask contains exactly 3 groups, where
//  each group is a monotonically increasing sequence with stride 3.
//  For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = (GroupSize * 3 + FirstGroupElement) % VF;
  }
}

//  DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
//  vpalignr works per lane,
//  where Lane is the # of lanes in a register:
//  VectorWide = 128 => Lane = 1
//  VectorWide = 256 => Lane = 2
//  For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
//  For Lane = 2 the shuffle pattern is:
//  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
//  The Imm variable sets the offset amount. The result of the function is
//  stored in the ShuffleMask vector, built as described at the beginning of
//  this description. AlignDirection is a boolean that indicates the direction
//  of the alignment (false - align to the "right" side while true - align to
//  the "left" side).
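//  E.g. for VT = v16i8 (a single lane), Imm = 3 and the default arguments,
//  the resulting mask is {3, 4, ..., 15, 16, 17, 18}, i.e. the single-lane
//  pattern {DiffToJump,...,DiffToJump+VF-1} with DiffToJump = 3.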
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source. If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - The function rebuilds the data into the order expected by
// the lane instructions. An assumption (the shape of the matrix) was made for
// the deinterleave to work with lane instructions such as 'vpalignr' or
// 'vpshufb'. This function ensures that the data is built in the correct way
// for those instructions. Each lane inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order: in InVec[0] you
// can find the first 128 bits of data. The number of different lanes inside a
// vector depends on 'VecElems'; in general, the formula is
// VecElems * type / 128. The size of the array 'InVec' depends on, and
// corresponds to, 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|
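// E.g. with VecElems = 32 (six 128-bit vectors in InVec), the loop below
// forms Vec[i] = concat(InVec[i], InVec[i + 3]) for i = 0..2, yielding the
// |0|3|, |1|4|, |2|5| lane layout shown above.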

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index, compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}

// Lower interleaved load(s) into target specific instructions/intrinsics.
// The lowering sequence varies depending on the vector-types, factor, number
// of shuffles and ISA. Currently, lowering is supported for 4-element vectors
// of 64 bits with Factor = 4, and for 16/32/64-element vectors of 8 bits with
// Factor = 3, on AVX; see X86InterleavedAccessGroup::isSupported.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
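
// Lower an interleaved store into target specific instructions/intrinsics.
// E.g. translate the single wide shuffle feeding the store,
//        %i.vec = shuffle <16 x i8> %v0, <16 x i8> %v1, <interleaving mask>
//        store <16 x i8> %i.vec, <16 x i8>* %ptr
// into a sequence of interleaving shuffles followed by one wide store; see
// X86InterleavedAccessGroup::isSupported for the exact types handled.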
bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}