//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  /// Data layout of the module containing 'Inst'; used for type-size queries.
  const DataLayout &DL;

  /// Builder used to emit the replacement instruction sequence.
  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  /// Interleaves four i8 vectors (stride-4 store pattern) of
  /// \p NumSubVecElems elements each.
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  /// Variant of the above specialized for VF = 8 (v8i8 sub-vectors).
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  /// Interleaves three i8 vectors (stride-3 store pattern).
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  /// De-interleaves a stride-3 load of i8 elements back into the three
  /// logical component vectors.
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    // Only the default address space (0) is supported for loads.
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffle represents stride 4 for byte type with size of
  // WideInstSize.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements load the information
  // in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    // Wide stride-3 loads are split into 128-bit (16 x i8) pieces so each
    // piece fits in one lane; the number of loads scales accordingly.
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  // Only the first partial load can rely on the original alignment; the
  // subsequent loads are offset by the sub-vector size, so their alignment
  // is the common alignment of the two.
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

// Identity shuffle mask for concatenating two vectors (up to 64 elements).
static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates shuffle according to two vectors. This function
// only works on instructions with lane inside 256 registers. According to
// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
// offset amount depends on the two integers, 'LowOffset' and 'HighOffset'.
// Where the 'LowOffset' refers to the first vector and the HighOffset refers to
// the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load.
// We must consider the elements order as above.
// In this function we are combining two types of shuffles.
// The first one is vpshufd and the second is a type of "blend" shuffle.
// By computing the shuffle on a sequence of 16 elements(one lane) and add the
// correct offset. We are creating a vpshufd + blend sequence between two
// shuffles.
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  // First half selects from the first source vector (offset by LowOffset);
  // second half selects from the second source (indices >= NumOfElm).
  for (int I : Mask)
    Out.push_back(I + LowOffset);
  for (int I : Mask)
    Out.push_back(I + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original state. And de-facto is
// the opposite of the function concatSubVector.

// For VecElems = 16
// Invec[0] -  |0|      TransposedMatrix[0] - |0|
// Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] -  |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  if (VecElems == 16) {
    // Single-lane case: a plain per-vector shuffle suffices.
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  // Blend pairs of input vectors lane-wise using a fresh mask per pair.
  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    // VecElems == 64: concatenate pairs of the intermediate 32-element
    // vectors back into full-width results.
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  // Interleaving mask: take element i of the first source followed by
  // element i of the second source.
  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  // Build word-granularity unpack masks, then widen them to byte indices.
  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  // HalfVT has half the elements at twice the scalar width (i8 -> i16).
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
  // shuffle pattern.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
  // shuffle pattern.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  // Widen the word-level masks to byte indices so they apply to i8 vectors.
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    // 128-bit case: the four unpack results are already in final order.
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  // Wider cases need a lane-fixup pass to restore in-memory order.
  reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
                   4, Builder);
}

// createShuffleStride returns shuffle mask of size N.
// The shuffle pattern is as following :
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
//  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
// inside mask a shuffleMask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  // VF here is the number of elements in one 128-bit lane.
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}

// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
// vpalign works according to lanes
// Where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// Imm variable sets the offset amount. The result of the
// function is stored inside ShuffleMask vector and it built as described in
// the begin of the description. AlignDirection is a boolean that indicates the
// direction of the alignment. (false - align to the "right" side while true -
// align to the "left" side)
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  // Right-alignment is expressed as a complementary left-alignment amount.
  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // if i+offset is out of this lane then we actually need the other source
      // If Unary the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - The function rebuilds the data to a correct expected
// order. An assumption(The shape of the matrix) was taken for the
// deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
// This function ensures that the data is built in correct way for the lane
// instructions. Each lane inside the vector is a 128-bit length.
//
// The 'InVec' argument contains the data in increasing order. In InVec[0] You
// can find the first 128 bit data. The number of different lanes inside a
// vector depends on the 'VecElems'.In general, the formula is
// VecElems * type / 128. The size of the array 'InVec' depends and equal to
// 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|              Vec[0] - |0|
// Invec[1] - |1|          =>  Vec[1] - |1|
// Invec[2] - |2|              Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|            Vec[0] - |0|3|
// Invec[1] - |2|3|        =>  Vec[1] - |1|4|
// Invec[2] - |4|5|            Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    // 128-bit vectors: nothing to concatenate.
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  // Pair up the 128-bit pieces stride-3 apart into 256-bit vectors.
  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  // VecElems == 64: concatenate once more into 512-bit vectors.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec =
Builder.CreateShuffleVector(Vec[1], VPAlign3); 598e8d8bef9SDimitry Andric TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2); 5990b57cec5SDimitry Andric TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec; 6000b57cec5SDimitry Andric TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2]; 6010b57cec5SDimitry Andric } 6020b57cec5SDimitry Andric 6030b57cec5SDimitry Andric // group2Shuffle reorder the shuffle stride back into continuous order. 6040b57cec5SDimitry Andric // For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} => 6050b57cec5SDimitry Andric // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}. 6065ffd83dbSDimitry Andric static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask, 6075ffd83dbSDimitry Andric SmallVectorImpl<int> &Output) { 6080b57cec5SDimitry Andric int IndexGroup[3] = {0, 0, 0}; 6090b57cec5SDimitry Andric int Index = 0; 6100b57cec5SDimitry Andric int VectorWidth = VT.getSizeInBits(); 6110b57cec5SDimitry Andric int VF = VT.getVectorNumElements(); 6120b57cec5SDimitry Andric // Find the index of the different groups. 6130b57cec5SDimitry Andric int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1; 6140b57cec5SDimitry Andric for (int i = 0; i < 3; i++) { 6150b57cec5SDimitry Andric IndexGroup[(Index * 3) % (VF / Lane)] = Index; 6160b57cec5SDimitry Andric Index += Mask[i]; 6170b57cec5SDimitry Andric } 6180b57cec5SDimitry Andric // According to the index compute the convert mask. 
6190b57cec5SDimitry Andric for (int i = 0; i < VF / Lane; i++) { 6200b57cec5SDimitry Andric Output.push_back(IndexGroup[i % 3]); 6210b57cec5SDimitry Andric IndexGroup[i % 3]++; 6220b57cec5SDimitry Andric } 6230b57cec5SDimitry Andric } 6240b57cec5SDimitry Andric 6250b57cec5SDimitry Andric void X86InterleavedAccessGroup::interleave8bitStride3( 6260b57cec5SDimitry Andric ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix, 6270b57cec5SDimitry Andric unsigned VecElems) { 6280b57cec5SDimitry Andric // Example: Assuming we start from the following vectors: 6290b57cec5SDimitry Andric // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7 6300b57cec5SDimitry Andric // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7 6310b57cec5SDimitry Andric // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7 6320b57cec5SDimitry Andric 6330b57cec5SDimitry Andric TransposedMatrix.resize(3); 6345ffd83dbSDimitry Andric SmallVector<int, 3> GroupSize; 6355ffd83dbSDimitry Andric SmallVector<int, 32> VPShuf; 6365ffd83dbSDimitry Andric SmallVector<int, 32> VPAlign[3]; 6375ffd83dbSDimitry Andric SmallVector<int, 32> VPAlign2; 6385ffd83dbSDimitry Andric SmallVector<int, 32> VPAlign3; 6390b57cec5SDimitry Andric 6400b57cec5SDimitry Andric Value *Vec[3], *TempVector[3]; 6410b57cec5SDimitry Andric MVT VT = MVT::getVectorVT(MVT::i8, VecElems); 6420b57cec5SDimitry Andric 6430b57cec5SDimitry Andric setGroupSize(VT, GroupSize); 6440b57cec5SDimitry Andric 6450b57cec5SDimitry Andric for (int i = 0; i < 3; i++) 6460b57cec5SDimitry Andric DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]); 6470b57cec5SDimitry Andric 6480b57cec5SDimitry Andric DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true); 6490b57cec5SDimitry Andric DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true); 6500b57cec5SDimitry Andric 6510b57cec5SDimitry Andric // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2 6520b57cec5SDimitry Andric // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4 6530b57cec5SDimitry Andric // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 6540b57cec5SDimitry 
Andric 655e8d8bef9SDimitry Andric Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2); 656e8d8bef9SDimitry Andric Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3); 6570b57cec5SDimitry Andric Vec[2] = InVec[2]; 6580b57cec5SDimitry Andric 6590b57cec5SDimitry Andric // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2 6600b57cec5SDimitry Andric // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5 6610b57cec5SDimitry Andric // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7 6620b57cec5SDimitry Andric 6630b57cec5SDimitry Andric for (int i = 0; i < 3; i++) 6640b57cec5SDimitry Andric TempVector[i] = 6650b57cec5SDimitry Andric Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]); 6660b57cec5SDimitry Andric 6670b57cec5SDimitry Andric // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1 6680b57cec5SDimitry Andric // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4 6690b57cec5SDimitry Andric // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 6700b57cec5SDimitry Andric 6710b57cec5SDimitry Andric for (int i = 0; i < 3; i++) 6720b57cec5SDimitry Andric Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3], 6730b57cec5SDimitry Andric VPAlign[2]); 6740b57cec5SDimitry Andric 6750b57cec5SDimitry Andric // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2 6760b57cec5SDimitry Andric // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5 6770b57cec5SDimitry Andric // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7 6780b57cec5SDimitry Andric 6790b57cec5SDimitry Andric unsigned NumOfElm = VT.getVectorNumElements(); 6800b57cec5SDimitry Andric group2Shuffle(VT, GroupSize, VPShuf); 6810b57cec5SDimitry Andric reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder); 6820b57cec5SDimitry Andric } 6830b57cec5SDimitry Andric 6840b57cec5SDimitry Andric void X86InterleavedAccessGroup::transpose_4x4( 6850b57cec5SDimitry Andric ArrayRef<Instruction *> Matrix, 6860b57cec5SDimitry Andric SmallVectorImpl<Value *> &TransposedMatrix) { 6870b57cec5SDimitry Andric assert(Matrix.size() == 4 && "Invalid matrix size"); 6880b57cec5SDimitry Andric 
TransposedMatrix.resize(4); 6890b57cec5SDimitry Andric 6900b57cec5SDimitry Andric // dst = src1[0,1],src2[0,1] 6915ffd83dbSDimitry Andric static constexpr int IntMask1[] = {0, 1, 4, 5}; 692bdd1243dSDimitry Andric ArrayRef<int> Mask = ArrayRef(IntMask1, 4); 6930b57cec5SDimitry Andric Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); 6940b57cec5SDimitry Andric Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); 6950b57cec5SDimitry Andric 6960b57cec5SDimitry Andric // dst = src1[2,3],src2[2,3] 6975ffd83dbSDimitry Andric static constexpr int IntMask2[] = {2, 3, 6, 7}; 698bdd1243dSDimitry Andric Mask = ArrayRef(IntMask2, 4); 6990b57cec5SDimitry Andric Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); 7000b57cec5SDimitry Andric Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); 7010b57cec5SDimitry Andric 7020b57cec5SDimitry Andric // dst = src1[0],src2[0],src1[2],src2[2] 7035ffd83dbSDimitry Andric static constexpr int IntMask3[] = {0, 4, 2, 6}; 704bdd1243dSDimitry Andric Mask = ArrayRef(IntMask3, 4); 7050b57cec5SDimitry Andric TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); 7060b57cec5SDimitry Andric TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); 7070b57cec5SDimitry Andric 7080b57cec5SDimitry Andric // dst = src1[1],src2[1],src1[3],src2[3] 7095ffd83dbSDimitry Andric static constexpr int IntMask4[] = {1, 5, 3, 7}; 710bdd1243dSDimitry Andric Mask = ArrayRef(IntMask4, 4); 7110b57cec5SDimitry Andric TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); 7120b57cec5SDimitry Andric TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); 7130b57cec5SDimitry Andric } 7140b57cec5SDimitry Andric 7150b57cec5SDimitry Andric // Lowers this interleaved access group into X86-specific 7160b57cec5SDimitry Andric // instructions/intrinsics. 
7170b57cec5SDimitry Andric bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { 7180b57cec5SDimitry Andric SmallVector<Instruction *, 4> DecomposedVectors; 7190b57cec5SDimitry Andric SmallVector<Value *, 4> TransposedVectors; 7205ffd83dbSDimitry Andric auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType()); 7210b57cec5SDimitry Andric 7220b57cec5SDimitry Andric if (isa<LoadInst>(Inst)) { 7235ffd83dbSDimitry Andric auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType()); 7245ffd83dbSDimitry Andric unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor; 7250b57cec5SDimitry Andric switch (NumSubVecElems) { 7260b57cec5SDimitry Andric default: 7270b57cec5SDimitry Andric return false; 7280b57cec5SDimitry Andric case 4: 7290b57cec5SDimitry Andric case 8: 7300b57cec5SDimitry Andric case 16: 7310b57cec5SDimitry Andric case 32: 7320b57cec5SDimitry Andric case 64: 733fe6060f1SDimitry Andric if (ShuffleTy->getNumElements() != NumSubVecElems) 734fe6060f1SDimitry Andric return false; 7350b57cec5SDimitry Andric break; 7360b57cec5SDimitry Andric } 7370b57cec5SDimitry Andric 738fe6060f1SDimitry Andric // Try to generate target-sized register(/instruction). 739fe6060f1SDimitry Andric decompose(Inst, Factor, ShuffleTy, DecomposedVectors); 740fe6060f1SDimitry Andric 741fe6060f1SDimitry Andric // Perform matrix-transposition in order to compute interleaved 742fe6060f1SDimitry Andric // results by generating some sort of (optimized) target-specific 743fe6060f1SDimitry Andric // instructions. 
744fe6060f1SDimitry Andric 745fe6060f1SDimitry Andric if (NumSubVecElems == 4) 746fe6060f1SDimitry Andric transpose_4x4(DecomposedVectors, TransposedVectors); 747fe6060f1SDimitry Andric else 748fe6060f1SDimitry Andric deinterleave8bitStride3(DecomposedVectors, TransposedVectors, 749fe6060f1SDimitry Andric NumSubVecElems); 750fe6060f1SDimitry Andric 7510b57cec5SDimitry Andric // Now replace the unoptimized-interleaved-vectors with the 7520b57cec5SDimitry Andric // transposed-interleaved vectors. 7530b57cec5SDimitry Andric for (unsigned i = 0, e = Shuffles.size(); i < e; ++i) 7540b57cec5SDimitry Andric Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); 7550b57cec5SDimitry Andric 7560b57cec5SDimitry Andric return true; 7570b57cec5SDimitry Andric } 7580b57cec5SDimitry Andric 7595ffd83dbSDimitry Andric Type *ShuffleEltTy = ShuffleTy->getElementType(); 7605ffd83dbSDimitry Andric unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor; 7610b57cec5SDimitry Andric 7620b57cec5SDimitry Andric // Lower the interleaved stores: 7630b57cec5SDimitry Andric // 1. Decompose the interleaved wide shuffle into individual shuffle 7640b57cec5SDimitry Andric // vectors. 7655ffd83dbSDimitry Andric decompose(Shuffles[0], Factor, 7665ffd83dbSDimitry Andric FixedVectorType::get(ShuffleEltTy, NumSubVecElems), 7670b57cec5SDimitry Andric DecomposedVectors); 7680b57cec5SDimitry Andric 7690b57cec5SDimitry Andric // 2. Transpose the interleaved-vectors into vectors of contiguous 7700b57cec5SDimitry Andric // elements. 
7710b57cec5SDimitry Andric switch (NumSubVecElems) { 7720b57cec5SDimitry Andric case 4: 7730b57cec5SDimitry Andric transpose_4x4(DecomposedVectors, TransposedVectors); 7740b57cec5SDimitry Andric break; 7750b57cec5SDimitry Andric case 8: 7760b57cec5SDimitry Andric interleave8bitStride4VF8(DecomposedVectors, TransposedVectors); 7770b57cec5SDimitry Andric break; 7780b57cec5SDimitry Andric case 16: 7790b57cec5SDimitry Andric case 32: 7800b57cec5SDimitry Andric case 64: 7810b57cec5SDimitry Andric if (Factor == 4) 7820b57cec5SDimitry Andric interleave8bitStride4(DecomposedVectors, TransposedVectors, 7830b57cec5SDimitry Andric NumSubVecElems); 7840b57cec5SDimitry Andric if (Factor == 3) 7850b57cec5SDimitry Andric interleave8bitStride3(DecomposedVectors, TransposedVectors, 7860b57cec5SDimitry Andric NumSubVecElems); 7870b57cec5SDimitry Andric break; 7880b57cec5SDimitry Andric default: 7890b57cec5SDimitry Andric return false; 7900b57cec5SDimitry Andric } 7910b57cec5SDimitry Andric 7920b57cec5SDimitry Andric // 3. Concatenate the contiguous-vectors back into a wide vector. 7930b57cec5SDimitry Andric Value *WideVec = concatenateVectors(Builder, TransposedVectors); 7940b57cec5SDimitry Andric 7950b57cec5SDimitry Andric // 4. Generate a store instruction for wide-vec. 7960b57cec5SDimitry Andric StoreInst *SI = cast<StoreInst>(Inst); 7975ffd83dbSDimitry Andric Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign()); 7980b57cec5SDimitry Andric 7990b57cec5SDimitry Andric return true; 8000b57cec5SDimitry Andric } 8010b57cec5SDimitry Andric 8020b57cec5SDimitry Andric // Lower interleaved load(s) into target specific instructions/ 8030b57cec5SDimitry Andric // intrinsics. Lowering sequence varies depending on the vector-types, factor, 8040b57cec5SDimitry Andric // number of shuffles and ISA. 8050b57cec5SDimitry Andric // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. 
8060b57cec5SDimitry Andric bool X86TargetLowering::lowerInterleavedLoad( 8070b57cec5SDimitry Andric LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 8080b57cec5SDimitry Andric ArrayRef<unsigned> Indices, unsigned Factor) const { 8090b57cec5SDimitry Andric assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 8100b57cec5SDimitry Andric "Invalid interleave factor"); 8110b57cec5SDimitry Andric assert(!Shuffles.empty() && "Empty shufflevector input"); 8120b57cec5SDimitry Andric assert(Shuffles.size() == Indices.size() && 8130b57cec5SDimitry Andric "Unmatched number of shufflevectors and indices"); 8140b57cec5SDimitry Andric 8150b57cec5SDimitry Andric // Create an interleaved access group. 8160b57cec5SDimitry Andric IRBuilder<> Builder(LI); 8170b57cec5SDimitry Andric X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, 8180b57cec5SDimitry Andric Builder); 8190b57cec5SDimitry Andric 8200b57cec5SDimitry Andric return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); 8210b57cec5SDimitry Andric } 8220b57cec5SDimitry Andric 8230b57cec5SDimitry Andric bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, 8240b57cec5SDimitry Andric ShuffleVectorInst *SVI, 8250b57cec5SDimitry Andric unsigned Factor) const { 8260b57cec5SDimitry Andric assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 8270b57cec5SDimitry Andric "Invalid interleave factor"); 8280b57cec5SDimitry Andric 8295ffd83dbSDimitry Andric assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor == 8305ffd83dbSDimitry Andric 0 && 8310b57cec5SDimitry Andric "Invalid interleaved store"); 8320b57cec5SDimitry Andric 8330b57cec5SDimitry Andric // Holds the indices of SVI that correspond to the starting index of each 8340b57cec5SDimitry Andric // interleaved shuffle. 
8350b57cec5SDimitry Andric SmallVector<unsigned, 4> Indices; 8360b57cec5SDimitry Andric auto Mask = SVI->getShuffleMask(); 8370b57cec5SDimitry Andric for (unsigned i = 0; i < Factor; i++) 8380b57cec5SDimitry Andric Indices.push_back(Mask[i]); 8390b57cec5SDimitry Andric 840bdd1243dSDimitry Andric ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI); 8410b57cec5SDimitry Andric 8420b57cec5SDimitry Andric // Create an interleaved access group. 8430b57cec5SDimitry Andric IRBuilder<> Builder(SI); 8440b57cec5SDimitry Andric X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget, 8450b57cec5SDimitry Andric Builder); 8460b57cec5SDimitry Andric 8470b57cec5SDimitry Andric return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); 8480b57cec5SDimitry Andric } 849