xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===- X86InterleavedAccess.cpp -------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// This file contains the X86 implementation of the interleaved accesses
110b57cec5SDimitry Andric /// optimization generating X86-specific instructions/intrinsics for
120b57cec5SDimitry Andric /// interleaved access groups.
130b57cec5SDimitry Andric //
140b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
150b57cec5SDimitry Andric 
160b57cec5SDimitry Andric #include "X86ISelLowering.h"
170b57cec5SDimitry Andric #include "X86Subtarget.h"
180b57cec5SDimitry Andric #include "llvm/ADT/ArrayRef.h"
190b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h"
200b57cec5SDimitry Andric #include "llvm/Analysis/VectorUtils.h"
21*0fca6ea1SDimitry Andric #include "llvm/CodeGenTypes/MachineValueType.h"
220b57cec5SDimitry Andric #include "llvm/IR/Constants.h"
230b57cec5SDimitry Andric #include "llvm/IR/DataLayout.h"
240b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h"
250b57cec5SDimitry Andric #include "llvm/IR/IRBuilder.h"
260b57cec5SDimitry Andric #include "llvm/IR/Instruction.h"
270b57cec5SDimitry Andric #include "llvm/IR/Instructions.h"
280b57cec5SDimitry Andric #include "llvm/IR/Module.h"
290b57cec5SDimitry Andric #include "llvm/IR/Type.h"
300b57cec5SDimitry Andric #include "llvm/IR/Value.h"
310b57cec5SDimitry Andric #include "llvm/Support/Casting.h"
320b57cec5SDimitry Andric #include <algorithm>
330b57cec5SDimitry Andric #include <cassert>
340b57cec5SDimitry Andric #include <cmath>
350b57cec5SDimitry Andric #include <cstdint>
360b57cec5SDimitry Andric 
370b57cec5SDimitry Andric using namespace llvm;
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric namespace {
400b57cec5SDimitry Andric 
410b57cec5SDimitry Andric /// This class holds necessary information to represent an interleaved
420b57cec5SDimitry Andric /// access group and supports utilities to lower the group into
430b57cec5SDimitry Andric /// X86-specific instructions/intrinsics.
440b57cec5SDimitry Andric ///  E.g. A group of interleaving access loads (Factor = 2; accessing every
450b57cec5SDimitry Andric ///       other element)
460b57cec5SDimitry Andric ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
47e8d8bef9SDimitry Andric ///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
48e8d8bef9SDimitry Andric ///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  /// Data layout of the module containing 'Inst'; used for type-size queries.
  const DataLayout &DL;

  /// IR builder used to emit the lowered shuffle/load sequence.
  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = P4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  /// Interleaves a stride-4 group of 8-bit-element vectors with
  /// \p NumSubVecElems elements per sub-vector.
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  /// Specialization of the above for 8 elements per sub-vector (VF8).
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  /// Interleaves a stride-3 group of 8-bit-element vectors.
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  /// De-interleaves a stride-3 group of 8-bit-element vectors.
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
1250b57cec5SDimitry Andric 
1260b57cec5SDimitry Andric } // end anonymous namespace
1270b57cec5SDimitry Andric 
1280b57cec5SDimitry Andric bool X86InterleavedAccessGroup::isSupported() const {
1290b57cec5SDimitry Andric   VectorType *ShuffleVecTy = Shuffles[0]->getType();
1305ffd83dbSDimitry Andric   Type *ShuffleEltTy = ShuffleVecTy->getElementType();
1310b57cec5SDimitry Andric   unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
1320b57cec5SDimitry Andric   unsigned WideInstSize;
1330b57cec5SDimitry Andric 
1340b57cec5SDimitry Andric   // Currently, lowering is supported for the following vectors:
1350b57cec5SDimitry Andric   // Stride 4:
1360b57cec5SDimitry Andric   //    1. Store and load of 4-element vectors of 64 bits on AVX.
1370b57cec5SDimitry Andric   //    2. Store of 16/32-element vectors of 8 bits on AVX.
1380b57cec5SDimitry Andric   // Stride 3:
1390b57cec5SDimitry Andric   //    1. Load of 16/32-element vectors of 8 bits on AVX.
1400b57cec5SDimitry Andric   if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
1410b57cec5SDimitry Andric     return false;
1420b57cec5SDimitry Andric 
1430b57cec5SDimitry Andric   if (isa<LoadInst>(Inst)) {
1440b57cec5SDimitry Andric     WideInstSize = DL.getTypeSizeInBits(Inst->getType());
1450b57cec5SDimitry Andric     if (cast<LoadInst>(Inst)->getPointerAddressSpace())
1460b57cec5SDimitry Andric       return false;
1470b57cec5SDimitry Andric   } else
1480b57cec5SDimitry Andric     WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
1490b57cec5SDimitry Andric 
1500b57cec5SDimitry Andric   // We support shuffle represents stride 4 for byte type with size of
1510b57cec5SDimitry Andric   // WideInstSize.
1520b57cec5SDimitry Andric   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
1530b57cec5SDimitry Andric     return true;
1540b57cec5SDimitry Andric 
1550b57cec5SDimitry Andric   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
1560b57cec5SDimitry Andric       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
1570b57cec5SDimitry Andric        WideInstSize == 2048))
1580b57cec5SDimitry Andric     return true;
1590b57cec5SDimitry Andric 
1600b57cec5SDimitry Andric   if (ShuffleElemSize == 8 && Factor == 3 &&
1610b57cec5SDimitry Andric       (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
1620b57cec5SDimitry Andric     return true;
1630b57cec5SDimitry Andric 
1640b57cec5SDimitry Andric   return false;
1650b57cec5SDimitry Andric }
1660b57cec5SDimitry Andric 
void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  // The wide vector must be big enough to hold all requested sub-vectors.
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  // Shuffle input: slice it into NumSubVectors sequential sub-shuffles, each
  // starting at the corresponding recorded index.
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements load the information
  // in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    // Wide 768/1536-bit accesses are split into 128-bit (16 x i8) chunks.
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  // Only the first chunk may rely on the original load's alignment; later
  // chunks get the common alignment of that and the chunk size in bytes.
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}
2260b57cec5SDimitry Andric 
2270b57cec5SDimitry Andric // Changing the scale of the vector type by reducing the number of elements and
2280b57cec5SDimitry Andric // doubling the scalar size.
2290b57cec5SDimitry Andric static MVT scaleVectorType(MVT VT) {
2300b57cec5SDimitry Andric   unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
2310b57cec5SDimitry Andric   return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
2320b57cec5SDimitry Andric                           VT.getVectorNumElements() / 2);
2330b57cec5SDimitry Andric }
2340b57cec5SDimitry Andric 
// Identity mask 0..63. Used (whole, or as a 16/32-element prefix) as the mask
// of a two-operand CreateShuffleVector to concatenate two vectors.
static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
2400b57cec5SDimitry Andric 
// genShuffleBland - Creates a shuffle mask from two vectors. This function
// only works on instructions whose lanes fit inside 256-bit registers. From
// the mask 'Mask' it creates a new mask 'Out' by applying an offset to each
// element. The offset amount depends on the two integers 'LowOffset' and
// 'HighOffset', where 'LowOffset' applies to the first vector and
// 'HighOffset' to the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror of the load,
// we must consider the element order as above.
// In this function we combine two kinds of shuffles: the first is a
// vpshufb-style per-lane shuffle and the second is a "blend"-style shuffle.
// By computing the shuffle on a sequence of 16 elements (one lane) and adding
// the correct offset, we create a vpshufb + blend sequence between two
// shuffles.
2575ffd83dbSDimitry Andric static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
2585ffd83dbSDimitry Andric                             SmallVectorImpl<int> &Out, int LowOffset,
2590b57cec5SDimitry Andric                             int HighOffset) {
2600b57cec5SDimitry Andric   assert(VT.getSizeInBits() >= 256 &&
2610b57cec5SDimitry Andric          "This function doesn't accept width smaller then 256");
2620b57cec5SDimitry Andric   unsigned NumOfElm = VT.getVectorNumElements();
263bdd1243dSDimitry Andric   for (int I : Mask)
264bdd1243dSDimitry Andric     Out.push_back(I + LowOffset);
265bdd1243dSDimitry Andric   for (int I : Mask)
266bdd1243dSDimitry Andric     Out.push_back(I + HighOffset + NumOfElm);
2670b57cec5SDimitry Andric }
2680b57cec5SDimitry Andric 
// reorderSubVector returns the data to its original state, and is de facto
// the inverse of the function concatSubVector.
2710b57cec5SDimitry Andric 
2720b57cec5SDimitry Andric // For VecElems = 16
2730b57cec5SDimitry Andric // Invec[0] -  |0|      TransposedMatrix[0] - |0|
2740b57cec5SDimitry Andric // Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
2750b57cec5SDimitry Andric // Invec[2] -  |2|      TransposedMatrix[2] - |2|
2760b57cec5SDimitry Andric 
2770b57cec5SDimitry Andric // For VecElems = 32
2780b57cec5SDimitry Andric // Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
2790b57cec5SDimitry Andric // Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
2800b57cec5SDimitry Andric // Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|
2810b57cec5SDimitry Andric 
2820b57cec5SDimitry Andric // For VecElems = 64
2830b57cec5SDimitry Andric // Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
2840b57cec5SDimitry Andric // Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
2850b57cec5SDimitry Andric // Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|
2860b57cec5SDimitry Andric 
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  // Single 128-bit lane: one per-vector shuffle restores the order directly.
  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  // Combine pairs of input vectors lane-wise: the mask built by
  // genShuffleBland shuffles each lane and blends across the pair.
  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    // VecElems == 64: concatenate consecutive pairs of partial results.
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}
3170b57cec5SDimitry Andric 
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  // MaskLow interleaves the two sources element by element: 0,8,1,9,...,7,15.
  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  // Build unpack-low/high masks on i16 elements, then widen each i16 mask
  // element into two byte-granularity elements.
  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}
3570b57cec5SDimitry Andric 
void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  // Same vector width with half the elements at twice the scalar size.
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
  // shuffle pattern.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
  // shuffle pattern, then widened to byte-granularity mask elements.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31

  // For a single 128-bit vector the unpack results are already in order.
  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  // Multi-lane vectors need the per-lane results put back in linear order.
  reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
                   4, Builder);
}
4250b57cec5SDimitry Andric 
4260b57cec5SDimitry Andric //  createShuffleStride returns shuffle mask of size N.
4270b57cec5SDimitry Andric //  The shuffle pattern is as following :
4280b57cec5SDimitry Andric //  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
4290b57cec5SDimitry Andric //  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
4300b57cec5SDimitry Andric //  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
4310b57cec5SDimitry Andric //  Where Lane is the # of lanes in a register:
4320b57cec5SDimitry Andric //  VectorSize = 128 => Lane = 1
4330b57cec5SDimitry Andric //  VectorSize = 256 => Lane = 2
4340b57cec5SDimitry Andric //  For example shuffle pattern for VF 16 register size 256 -> lanes = 2
4350b57cec5SDimitry Andric //  {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
4360b57cec5SDimitry Andric static void createShuffleStride(MVT VT, int Stride,
4375ffd83dbSDimitry Andric                                 SmallVectorImpl<int> &Mask) {
4380b57cec5SDimitry Andric   int VectorSize = VT.getSizeInBits();
4390b57cec5SDimitry Andric   int VF = VT.getVectorNumElements();
4400b57cec5SDimitry Andric   int LaneCount = std::max(VectorSize / 128, 1);
4410b57cec5SDimitry Andric   for (int Lane = 0; Lane < LaneCount; Lane++)
4420b57cec5SDimitry Andric     for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
4430b57cec5SDimitry Andric       Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
4440b57cec5SDimitry Andric }
4450b57cec5SDimitry Andric 
4460b57cec5SDimitry Andric //  setGroupSize sets 'SizeInfo' to the size(number of elements) of group
4470b57cec5SDimitry Andric //  inside mask a shuffleMask. A mask contains exactly 3 groups, where
4480b57cec5SDimitry Andric //  each group is a monotonically increasing sequence with stride 3.
4490b57cec5SDimitry Andric //  For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
4505ffd83dbSDimitry Andric static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
4510b57cec5SDimitry Andric   int VectorSize = VT.getSizeInBits();
4520b57cec5SDimitry Andric   int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
4530b57cec5SDimitry Andric   for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
4540b57cec5SDimitry Andric     int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
4550b57cec5SDimitry Andric     SizeInfo.push_back(GroupSize);
4560b57cec5SDimitry Andric     FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
4570b57cec5SDimitry Andric   }
4580b57cec5SDimitry Andric }
4590b57cec5SDimitry Andric 
4600b57cec5SDimitry Andric //  DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
4610b57cec5SDimitry Andric //  vpalign works according to lanes
4620b57cec5SDimitry Andric //  Where Lane is the # of lanes in a register:
4630b57cec5SDimitry Andric //  VectorWide = 128 => Lane = 1
4640b57cec5SDimitry Andric //  VectorWide = 256 => Lane = 2
4650b57cec5SDimitry Andric //  For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
4660b57cec5SDimitry Andric //  For Lane = 2 shuffle pattern is:
4670b57cec5SDimitry Andric //  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
4680b57cec5SDimitry Andric //  Imm variable sets the offset amount. The result of the
4690b57cec5SDimitry Andric //  function is stored inside ShuffleMask vector and it built as described in
4700b57cec5SDimitry Andric //  the begin of the description. AlignDirection is a boolean that indicates the
4710b57cec5SDimitry Andric //  direction of the alignment. (false - align to the "right" side while true -
4720b57cec5SDimitry Andric //  align to the "left" side)
4730b57cec5SDimitry Andric static void DecodePALIGNRMask(MVT VT, unsigned Imm,
4745ffd83dbSDimitry Andric                               SmallVectorImpl<int> &ShuffleMask,
4750b57cec5SDimitry Andric                               bool AlignDirection = true, bool Unary = false) {
4760b57cec5SDimitry Andric   unsigned NumElts = VT.getVectorNumElements();
4770b57cec5SDimitry Andric   unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
4780b57cec5SDimitry Andric   unsigned NumLaneElts = NumElts / NumLanes;
4790b57cec5SDimitry Andric 
4800b57cec5SDimitry Andric   Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
4810b57cec5SDimitry Andric   unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
4820b57cec5SDimitry Andric 
4830b57cec5SDimitry Andric   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4840b57cec5SDimitry Andric     for (unsigned i = 0; i != NumLaneElts; ++i) {
4850b57cec5SDimitry Andric       unsigned Base = i + Offset;
4860b57cec5SDimitry Andric       // if i+offset is out of this lane then we actually need the other source
4870b57cec5SDimitry Andric       // If Unary the other source is the first source.
4880b57cec5SDimitry Andric       if (Base >= NumLaneElts)
4890b57cec5SDimitry Andric         Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
4900b57cec5SDimitry Andric       ShuffleMask.push_back(Base + l);
4910b57cec5SDimitry Andric     }
4920b57cec5SDimitry Andric   }
4930b57cec5SDimitry Andric }
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric // concatSubVector - The function rebuilds the data to a correct expected
4960b57cec5SDimitry Andric // order. An assumption(The shape of the matrix) was taken for the
4970b57cec5SDimitry Andric // deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
4980b57cec5SDimitry Andric // This function ensures that the data is built in correct way for the lane
4990b57cec5SDimitry Andric // instructions. Each lane inside the vector is a 128-bit length.
5000b57cec5SDimitry Andric //
5010b57cec5SDimitry Andric // The 'InVec' argument contains the data in increasing order. In InVec[0] You
5020b57cec5SDimitry Andric // can find the first 128 bit data. The number of different lanes inside a
5030b57cec5SDimitry Andric // vector depends on the 'VecElems'.In general, the formula is
5040b57cec5SDimitry Andric // VecElems * type / 128. The size of the array 'InVec' depends and equal to
5050b57cec5SDimitry Andric // 'VecElems'.
5060b57cec5SDimitry Andric 
5070b57cec5SDimitry Andric // For VecElems = 16
5080b57cec5SDimitry Andric // Invec[0] - |0|      Vec[0] - |0|
5090b57cec5SDimitry Andric // Invec[1] - |1|  =>  Vec[1] - |1|
5100b57cec5SDimitry Andric // Invec[2] - |2|      Vec[2] - |2|
5110b57cec5SDimitry Andric 
5120b57cec5SDimitry Andric // For VecElems = 32
5130b57cec5SDimitry Andric // Invec[0] - |0|1|      Vec[0] - |0|3|
5140b57cec5SDimitry Andric // Invec[1] - |2|3|  =>  Vec[1] - |1|4|
5150b57cec5SDimitry Andric // Invec[2] - |4|5|      Vec[2] - |2|5|
5160b57cec5SDimitry Andric 
5170b57cec5SDimitry Andric // For VecElems = 64
5180b57cec5SDimitry Andric // Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
5190b57cec5SDimitry Andric // Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
5200b57cec5SDimitry Andric // Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|
5210b57cec5SDimitry Andric 
5220b57cec5SDimitry Andric static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
5235ffd83dbSDimitry Andric                             unsigned VecElems, IRBuilder<> &Builder) {
5240b57cec5SDimitry Andric   if (VecElems == 16) {
5250b57cec5SDimitry Andric     for (int i = 0; i < 3; i++)
5260b57cec5SDimitry Andric       Vec[i] = InVec[i];
5270b57cec5SDimitry Andric     return;
5280b57cec5SDimitry Andric   }
5290b57cec5SDimitry Andric 
5300b57cec5SDimitry Andric   for (unsigned j = 0; j < VecElems / 32; j++)
5310b57cec5SDimitry Andric     for (int i = 0; i < 3; i++)
5320b57cec5SDimitry Andric       Vec[i + j * 3] = Builder.CreateShuffleVector(
533bdd1243dSDimitry Andric           InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));
5340b57cec5SDimitry Andric 
5350b57cec5SDimitry Andric   if (VecElems == 32)
5360b57cec5SDimitry Andric     return;
5370b57cec5SDimitry Andric 
5380b57cec5SDimitry Andric   for (int i = 0; i < 3; i++)
5390b57cec5SDimitry Andric     Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
5400b57cec5SDimitry Andric }
5410b57cec5SDimitry Andric 
// Deinterleave a stride-3 group of i8 vectors: the three input registers hold
// interleaved a/b/c components; on return TransposedMatrix holds one register
// per contiguous component. All rotations are per-128-bit-lane so they map to
// palignr-style target instructions.
void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  // Build the stride-3 gather mask and the per-group element counts that
  // parameterize the rotations below.
  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  // Two "right"-aligned rotation masks used for the cross-vector steps.
  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  // Two unary (single-source) rotation masks for the final fix-up.
  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  // Reorder the decomposed loads into the lane layout the rotations expect.
  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  // Gather each stride-3 group so same-component elements become adjacent.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  // First cross-vector rotation: pull the tail of the neighboring vector in
  // front of this one.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  // Second cross-vector rotation: each vector now holds a single component,
  // possibly rotated within itself.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  // Undo the residual rotation in Vec[0]/Vec[1]. For VF8 (VecElems == 8) the
  // b and c rows come out swapped relative to the wider cases, hence the
  // selects below.
  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}
6020b57cec5SDimitry Andric 
6030b57cec5SDimitry Andric // group2Shuffle reorder the shuffle stride back into continuous order.
6040b57cec5SDimitry Andric // For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
6050b57cec5SDimitry Andric // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
6065ffd83dbSDimitry Andric static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
6075ffd83dbSDimitry Andric                           SmallVectorImpl<int> &Output) {
6080b57cec5SDimitry Andric   int IndexGroup[3] = {0, 0, 0};
6090b57cec5SDimitry Andric   int Index = 0;
6100b57cec5SDimitry Andric   int VectorWidth = VT.getSizeInBits();
6110b57cec5SDimitry Andric   int VF = VT.getVectorNumElements();
6120b57cec5SDimitry Andric   // Find the index of the different groups.
6130b57cec5SDimitry Andric   int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
6140b57cec5SDimitry Andric   for (int i = 0; i < 3; i++) {
6150b57cec5SDimitry Andric     IndexGroup[(Index * 3) % (VF / Lane)] = Index;
6160b57cec5SDimitry Andric     Index += Mask[i];
6170b57cec5SDimitry Andric   }
6180b57cec5SDimitry Andric   // According to the index compute the convert mask.
6190b57cec5SDimitry Andric   for (int i = 0; i < VF / Lane; i++) {
6200b57cec5SDimitry Andric     Output.push_back(IndexGroup[i % 3]);
6210b57cec5SDimitry Andric     IndexGroup[i % 3]++;
6220b57cec5SDimitry Andric   }
6230b57cec5SDimitry Andric }
6240b57cec5SDimitry Andric 
// Interleave three contiguous i8 component vectors (all a's, all b's, all
// c's) into stride-3 form for a grouped store. This is the inverse of
// deinterleave8bitStride3 and likewise uses per-lane palignr-style rotations.
void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  // Per-group element counts parameterize the rotations below.
  setGroupSize(VT, GroupSize);

  // "Left"-aligned two-source rotation masks (VPAlign[0] is built but only
  // VPAlign[1] and VPAlign[2] are consumed below).
  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  // Unary (single-source) pre-rotation masks, aligned to the "right".
  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  // Pre-rotate the a and c rows so the cross-vector steps line up; note the
  // b row enters the pipeline unrotated as Vec[2].
  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  // First cross-vector rotation: splice the tail of the neighboring vector
  // onto this one.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  // Second cross-vector rotation completes the block transpose.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  // Finally scatter each vector back into stride-3 order and fix up the
  // lane layout for the store.
  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}
6830b57cec5SDimitry Andric 
6840b57cec5SDimitry Andric void X86InterleavedAccessGroup::transpose_4x4(
6850b57cec5SDimitry Andric     ArrayRef<Instruction *> Matrix,
6860b57cec5SDimitry Andric     SmallVectorImpl<Value *> &TransposedMatrix) {
6870b57cec5SDimitry Andric   assert(Matrix.size() == 4 && "Invalid matrix size");
6880b57cec5SDimitry Andric   TransposedMatrix.resize(4);
6890b57cec5SDimitry Andric 
6900b57cec5SDimitry Andric   // dst = src1[0,1],src2[0,1]
6915ffd83dbSDimitry Andric   static constexpr int IntMask1[] = {0, 1, 4, 5};
692bdd1243dSDimitry Andric   ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
6930b57cec5SDimitry Andric   Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
6940b57cec5SDimitry Andric   Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
6950b57cec5SDimitry Andric 
6960b57cec5SDimitry Andric   // dst = src1[2,3],src2[2,3]
6975ffd83dbSDimitry Andric   static constexpr int IntMask2[] = {2, 3, 6, 7};
698bdd1243dSDimitry Andric   Mask = ArrayRef(IntMask2, 4);
6990b57cec5SDimitry Andric   Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
7000b57cec5SDimitry Andric   Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
7010b57cec5SDimitry Andric 
7020b57cec5SDimitry Andric   // dst = src1[0],src2[0],src1[2],src2[2]
7035ffd83dbSDimitry Andric   static constexpr int IntMask3[] = {0, 4, 2, 6};
704bdd1243dSDimitry Andric   Mask = ArrayRef(IntMask3, 4);
7050b57cec5SDimitry Andric   TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
7060b57cec5SDimitry Andric   TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
7070b57cec5SDimitry Andric 
7080b57cec5SDimitry Andric   // dst = src1[1],src2[1],src1[3],src2[3]
7095ffd83dbSDimitry Andric   static constexpr int IntMask4[] = {1, 5, 3, 7};
710bdd1243dSDimitry Andric   Mask = ArrayRef(IntMask4, 4);
7110b57cec5SDimitry Andric   TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
7120b57cec5SDimitry Andric   TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
7130b57cec5SDimitry Andric }
7140b57cec5SDimitry Andric 
// Lowers this interleaved access group into X86-specific
// instructions/intrinsics. Returns false when the group's shape (element
// count / factor combination) is not one of the supported patterns, leaving
// the IR untouched.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  // Load path: deinterleave a wide load into per-component vectors.
  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    // Only these sub-vector widths are supported, and each extracting
    // shuffle must cover exactly one sub-vector.
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    // 4-element groups use the generic 4x4 transpose; the wider i8 cases go
    // through the stride-3 deinterleave sequence.
    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  // Store path: interleave per-component vectors and emit one wide store.
  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}
8010b57cec5SDimitry Andric 
8020b57cec5SDimitry Andric // Lower interleaved load(s) into target specific instructions/
8030b57cec5SDimitry Andric // intrinsics. Lowering sequence varies depending on the vector-types, factor,
8040b57cec5SDimitry Andric // number of shuffles and ISA.
8050b57cec5SDimitry Andric // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
8060b57cec5SDimitry Andric bool X86TargetLowering::lowerInterleavedLoad(
8070b57cec5SDimitry Andric     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
8080b57cec5SDimitry Andric     ArrayRef<unsigned> Indices, unsigned Factor) const {
8090b57cec5SDimitry Andric   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
8100b57cec5SDimitry Andric          "Invalid interleave factor");
8110b57cec5SDimitry Andric   assert(!Shuffles.empty() && "Empty shufflevector input");
8120b57cec5SDimitry Andric   assert(Shuffles.size() == Indices.size() &&
8130b57cec5SDimitry Andric          "Unmatched number of shufflevectors and indices");
8140b57cec5SDimitry Andric 
8150b57cec5SDimitry Andric   // Create an interleaved access group.
8160b57cec5SDimitry Andric   IRBuilder<> Builder(LI);
8170b57cec5SDimitry Andric   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
8180b57cec5SDimitry Andric                                 Builder);
8190b57cec5SDimitry Andric 
8200b57cec5SDimitry Andric   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
8210b57cec5SDimitry Andric }
8220b57cec5SDimitry Andric 
8230b57cec5SDimitry Andric bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
8240b57cec5SDimitry Andric                                               ShuffleVectorInst *SVI,
8250b57cec5SDimitry Andric                                               unsigned Factor) const {
8260b57cec5SDimitry Andric   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
8270b57cec5SDimitry Andric          "Invalid interleave factor");
8280b57cec5SDimitry Andric 
8295ffd83dbSDimitry Andric   assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
8305ffd83dbSDimitry Andric              0 &&
8310b57cec5SDimitry Andric          "Invalid interleaved store");
8320b57cec5SDimitry Andric 
8330b57cec5SDimitry Andric   // Holds the indices of SVI that correspond to the starting index of each
8340b57cec5SDimitry Andric   // interleaved shuffle.
8350b57cec5SDimitry Andric   SmallVector<unsigned, 4> Indices;
8360b57cec5SDimitry Andric   auto Mask = SVI->getShuffleMask();
8370b57cec5SDimitry Andric   for (unsigned i = 0; i < Factor; i++)
8380b57cec5SDimitry Andric     Indices.push_back(Mask[i]);
8390b57cec5SDimitry Andric 
840bdd1243dSDimitry Andric   ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);
8410b57cec5SDimitry Andric 
8420b57cec5SDimitry Andric   // Create an interleaved access group.
8430b57cec5SDimitry Andric   IRBuilder<> Builder(SI);
8440b57cec5SDimitry Andric   X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
8450b57cec5SDimitry Andric                                 Builder);
8460b57cec5SDimitry Andric 
8470b57cec5SDimitry Andric   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
8480b57cec5SDimitry Andric }
849