//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
///  E.g. A group of interleaving access loads (Factor = 2; accessing every
///       other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
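///  E.g. A group of interleaving access stores (Factor = 3; storing every
///       3rd element) -- shown here for illustration only; the store shapes
///       this target actually handles are listed in isSupported():
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                         <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr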
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first index of each interleaved-vector
  /// \p 'Ind', and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
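  // For example, a <16 x i64> wide load deinterleaved by four <4 x i64>
  // shuffles (Factor = 4) matches the first stride-4 case above:
  // ShuffleElemSize == 64 and WideInstSize == 1024.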
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffles that represent stride 4 for byte element types with
  // a total size of WideInstSize.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the
  // information in the following way:
  // [0,1,...,VF/2-1, VF/2+VF, VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
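// For example, v32i8 is scaled to v16i16, and v16i8 to v8i16.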
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

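// Identity index sequence 0..63. Taking the first 2*N entries as a shuffle
// mask over two N-element vectors simply concatenates them.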
static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a shuffle mask from two vectors. This function
// only works on instructions whose lanes lie within 256-bit registers. From
// the mask 'Mask' it creates a new mask 'Out' by applying an offset. The
// offset amount depends on two integers, 'LowOffset' and 'HighOffset', where
// 'LowOffset' refers to the first vector and 'HighOffset' refers to the
// second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror of the load, we must consider the
// element order as above.
// This function combines two kinds of shuffles: the first is a vpshuf-style
// per-lane shuffle and the second is a "blend"-style shuffle. By computing
// the shuffle on a sequence of 16 elements (one lane) and adding the correct
// offset, we create a vpshuf + blend sequence between two shuffles.
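// For example, with a 16-element per-lane mask and NumOfElm == 32, the first
// 16 entries of 'Out' are Mask[i] + LowOffset (selecting from the first
// vector) and the last 16 are Mask[i] + HighOffset + 32 (selecting from the
// second vector).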
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (int I : Mask)
    Out.push_back(I + LowOffset);
  for (int I : Mask)
    Out.push_back(I + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original state, and is de facto
// the opposite of the function concatSubVector.

// For VecElems = 16
// Invec[0] -  |0|      TransposedMatrix[0] - |0|
// Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] -  |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskLow and MaskHigh are built to match the vpunpcklbw and vpunpckhbw X86
  // shuffle patterns.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // LowHighMask[0] and LowHighMask[1] are built to match the vpunpckldw and
  // vpunpckhdw X86 shuffle patterns.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
                   4, Builder);
}

//  createShuffleStride returns a shuffle mask of size N.
//  The shuffle pattern is as follows:
//  {0, Stride%(VF/Lane), (2*Stride)%(VF/Lane), ..., (VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane), ...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
//  where Lane is the # of lanes in a register:
//  VectorSize = 128 => Lane = 1
//  VectorSize = 256 => Lane = 2
//  For example, the shuffle pattern for VF 16 with register size 256 (lanes = 2)
//  is {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}.
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

//  setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
//  inside a shuffle mask. A mask contains exactly 3 groups, where each group
//  is a monotonically increasing sequence with stride 3.
//  For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}

//  DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
//  vpalignr works per lane, where Lane is the # of lanes in a register:
//  VectorWide = 128 => Lane = 1
//  VectorWide = 256 => Lane = 2
//  For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
//  For Lane = 2 the shuffle pattern is:
//  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
//  The Imm variable sets the offset amount. The result of the function is
//  stored in the ShuffleMask vector, built as described above. AlignDirection
//  is a boolean that indicates the direction of the alignment (false - align
//  to the "right" side, true - align to the "left" side).
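//  For example, DecodePALIGNRMask(MVT::v16i8, 2, Mask) yields
//  {2, 3, ..., 15, 16, 17}, where indices 16 and 17 select the first two
//  bytes of the second source.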
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other source.
      // If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - Rebuilds the data into the expected order. An assumption
// about the shape of the matrix was made so that the deinterleave works with
// per-lane instructions like 'vpalignr' or 'vpshuf'. This function ensures
// that the data is arranged correctly for those lane instructions. Each lane
// inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order: InVec[0] holds
// the first 128 bits of data. The number of lanes inside a vector depends on
// 'VecElems'; in general, the formula is VecElems * type / 128. The size of
// the array 'InVec' depends on 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
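// Note that only the first three entries of 'Mask' are read; in practice they
// hold the group sizes computed by setGroupSize (e.g. {6, 5, 5} for VF16).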
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = ArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = ArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = ArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}

// Lower interleaved load(s) into target-specific instructions/intrinsics.
// The lowering sequence varies depending on the vector types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
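  // For example, for Factor = 4 and an interleaving shuffle mask
  // <0, 8, 16, 24, 1, 9, 17, 25, ...>, Indices becomes {0, 8, 16, 24}.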
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
846