//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffles that represent stride 4 for byte type with size of
  // WideInstSize.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the information
  // in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}

// Changes the scale of the vector type by halving the number of elements and
// doubling the scalar size.
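// E.g. v32i8 is rescaled to v16i16.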
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a shuffle mask that combines two input vectors.
// This function only works on instructions whose lanes lie inside 256-bit
// registers. Based on the mask 'Mask' it creates a new mask 'Out' by adding an
// offset to each element. The offset amount depends on the two integers
// 'LowOffset' and 'HighOffset', where 'LowOffset' refers to the first vector
// and 'HighOffset' refers to the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror of the load, we must consider the
// element order as above. In this function we are combining two kinds of
// shuffles: a per-lane shuffle (vpshufb-like) and a "blend" shuffle. By
// computing the shuffle on a sequence of 16 elements (one lane) and adding the
// correct offset, we create a shuffle + blend sequence between two shuffles.
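// For example (hypothetical values): for a v32i8 type with Mask = {0, 1, 2},
// LowOffset = 0 and HighOffset = 16, the resulting 'Out' mask is
// {0, 1, 2, 48, 49, 50}; the second half is Mask + HighOffset + NumOfElm.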
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (int I : Mask)
    Out.push_back(I + LowOffset);
  for (int I : Mask)
    Out.push_back(I + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original state, and is de facto the
// inverse of the function concatSubVector.

// For VecElems = 16
// Invec[0] - |0|      TransposedMatrix[0] - |0|
// Invec[1] - |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] - |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] - |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] - |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] - |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] - |0|3|6|9 |      TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] - |1|4|7|10|  =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] - |2|5|8|11|      TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskHighTemp and MaskLowTemp are built with the vpunpckhbw and vpunpcklbw
  // X86 shuffle patterns.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // MaskHighTemp1 and MaskLowTemp1 are built with the vpunpckhdw and
  // vpunpckldw X86 shuffle patterns.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7   | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7   | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
                   4, Builder);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
//  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF 16 with register size 256
// (lanes = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
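// For a single 128-bit lane, e.g. v16i8 with Stride 3, the mask is
// {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}.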
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
// inside a shuffle mask. A mask contains exactly 3 groups, where each group is
// a monotonically increasing sequence with stride 3.
// For example, shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr operates per lane, where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm variable sets the offset amount. The result of the function is
// stored inside the ShuffleMask vector and is built as described above.
// AlignDirection is a boolean that indicates the direction of the alignment
// (false - align to the "right" side, true - align to the "left" side).
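// For example (hypothetical values): for VT = v16i8, Imm = 3,
// AlignDirection = true and Unary = false, the resulting mask is
// {3,4,...,15,16,17,18}, i.e. elements 3..15 of the first source followed by
// elements 0..2 of the second source.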
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source. If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - The function rebuilds the data into the expected order. An
// assumption about the shape of the matrix was made so that the deinterleave
// works with lane-based instructions like 'vpalignr' or 'vpshufb'. This
// function ensures that the data is laid out correctly for those lane
// instructions. Each lane inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order: InVec[0] holds
// the first 128 bits of data. The number of lanes inside a vector depends on
// 'VecElems'; in general, the formula is VecElems * type / 128. The size of
// the array 'InVec' equals 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the strided shuffle back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
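// Note that in its caller (interleave8bitStride3) the 'Mask' argument is the
// group-size vector produced by setGroupSize (e.g. {6,5,5} for VF 16), from
// which the starting index of each group is derived.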
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index, compute the conversion mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = ArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = ArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = ArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}

// Lower interleaved load(s) into target specific instructions/intrinsics.
// Lowering sequence varies depending on the vector-types, factor, number of
// shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}