1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct a vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
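// For example (an illustrative sketch, not taken from a particular test):
// four adjacent scalar stores such as
//   store double %a0, double* %p0
//   store double %a1, double* %p1   ; %p1 == %p0 + 1
//   store double %a2, double* %p2   ; %p2 == %p0 + 2
//   store double %a3, double* %p3   ; %p3 == %p0 + 3
// can seed a tree whose leaves are the operands feeding %a0..%a3; if the
// cost model approves, the tree is emitted as <4 x double> operations that
// end in a single vector store.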
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize.h"
19 #include "llvm/ADT/MapVector.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/Optional.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/CodeMetrics.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/ScalarEvolution.h"
29 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/Analysis/ValueTracking.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/NoFolder.h"
39 #include "llvm/IR/Type.h"
40 #include "llvm/IR/Value.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Utils/VectorUtils.h"
47 #include <algorithm>
48 #include <map>
49 #include <memory>
50
51 using namespace llvm;
52
53 #define SV_NAME "slp-vectorizer"
54 #define DEBUG_TYPE "SLP"
55
56 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
57
58 static cl::opt<int>
59 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
60 cl::desc("Only vectorize if you gain more than this "
61 "number "));
62
63 static cl::opt<bool>
64 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
65 cl::desc("Attempt to vectorize horizontal reductions"));
66
67 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
68 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
69 cl::desc(
70 "Attempt to vectorize horizontal reductions feeding into a store"));
71
72 namespace {
73
74 static const unsigned MinVecRegSize = 128;
75
76 static const unsigned RecursionMaxDepth = 12;
77
78 /// \brief Predicate for the element types that the SLP vectorizer supports.
79 ///
80 /// The most important thing to filter here are types which are invalid in LLVM
81 /// vectors. We also filter target specific types which have absolutely no
82 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
83 /// avoids spending time checking the cost model and realizing that they will
84 /// be inevitably scalarized.
85 static bool isValidElementType(Type *Ty) {
86 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
87 !Ty->isPPC_FP128Ty();
88 }
89
90 /// \returns the parent basic block if all of the instructions in \p VL
91 /// are in the same block or null otherwise.
92 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
93 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
94 if (!I0)
95 return nullptr;
96 BasicBlock *BB = I0->getParent();
97 for (int i = 1, e = VL.size(); i < e; i++) {
98 Instruction *I = dyn_cast<Instruction>(VL[i]);
99 if (!I)
100 return nullptr;
101
102 if (BB != I->getParent())
103 return nullptr;
104 }
105 return BB;
106 }
107
108 /// \returns True if all of the values in \p VL are constants.
109 static bool allConstant(ArrayRef<Value *> VL) {
110 for (unsigned i = 0, e = VL.size(); i < e; ++i)
111 if (!isa<Constant>(VL[i]))
112 return false;
113 return true;
114 }
115
116 /// \returns True if all of the values in \p VL are identical.
117 static bool isSplat(ArrayRef<Value *> VL) {
118 for (unsigned i = 1, e = VL.size(); i < e; ++i)
119 if (VL[i] != VL[0])
120 return false;
121 return true;
122 }
123
124 ///\returns Opcode that can be clubbed with \p Op to create an alternate
125 /// sequence which can later be merged as a ShuffleVector instruction.
126 static unsigned getAltOpcode(unsigned Op) {
127 switch (Op) {
128 case Instruction::FAdd:
129 return Instruction::FSub;
130 case Instruction::FSub:
131 return Instruction::FAdd;
132 case Instruction::Add:
133 return Instruction::Sub;
134 case Instruction::Sub:
135 return Instruction::Add;
136 default:
137 return 0;
138 }
139 }
140
141 ///\returns bool representing if Opcode \p Op can be part
142 /// of an alternate sequence which can later be merged as
143 /// a ShuffleVector instruction.
144 static bool canCombineAsAltInst(unsigned Op) {
145 if (Op == Instruction::FAdd || Op == Instruction::FSub ||
146 Op == Instruction::Sub || Op == Instruction::Add)
147 return true;
148 return false;
149 }
150
151 /// \returns ShuffleVector instruction if instructions in \p VL form an
152 /// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
153 /// (i.e. opcodes of fadd,fsub,fadd,fsub...).
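/// For example (a sketch): the scalar bundle
///   %r0 = fadd float %a0, %b0
///   %r1 = fsub float %a1, %b1
///   %r2 = fadd float %a2, %b2
///   %r3 = fsub float %a3, %b3
/// can be emitted as one vector fadd, one vector fsub, and a shufflevector
/// that takes the even lanes from the fadd and the odd lanes from the fsub.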
154 static unsigned isAltInst(ArrayRef<Value *> VL) {
155 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
156 unsigned Opcode = I0->getOpcode();
157 unsigned AltOpcode = getAltOpcode(Opcode);
158 for (int i = 1, e = VL.size(); i < e; i++) {
159 Instruction *I = dyn_cast<Instruction>(VL[i]);
160 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
161 return 0;
162 }
163 return Instruction::ShuffleVector;
164 }
165
166 /// \returns The opcode if all of the Instructions in \p VL have the same
167 /// opcode, or zero.
168 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
169 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
170 if (!I0)
171 return 0;
172 unsigned Opcode = I0->getOpcode();
173 for (int i = 1, e = VL.size(); i < e; i++) {
174 Instruction *I = dyn_cast<Instruction>(VL[i]);
175 if (!I || Opcode != I->getOpcode()) {
176 if (canCombineAsAltInst(Opcode) && i == 1)
177 return isAltInst(VL);
178 return 0;
179 }
180 }
181 return Opcode;
182 }
183
184 /// Get the intersection (logical and) of all of the potential IR flags
185 /// of each scalar operation (VL) that will be converted into a vector (I).
186 /// Flag set: NSW, NUW, exact, and all of fast-math.
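/// For example (a sketch): when combining
///   %x = add nuw nsw i32 %a, %b
///   %y = add nsw i32 %c, %d
/// into one vector add, only 'nsw' survives the intersection, so the vector
/// instruction is tagged 'add nsw' and 'nuw' is dropped.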
187 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
188 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
189 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
190 // Intersection is initialized to the 0th scalar,
191 // so start counting from index '1'.
192 for (int i = 1, e = VL.size(); i < e; ++i) {
193 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
194 Intersection->andIRFlags(Scalar);
195 }
196 VecOp->copyIRFlags(Intersection);
197 }
198 }
199 }
200
201 /// \returns \p I after propagating metadata from \p VL.
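/// For example (a sketch): if the scalar loads carry different !tbaa tags,
/// the vector load gets the most generic common TBAA tag; metadata kinds not
/// handled below are dropped from the vector instruction.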
202 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
203 Instruction *I0 = cast<Instruction>(VL[0]);
204 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
205 I0->getAllMetadataOtherThanDebugLoc(Metadata);
206
207 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
208 unsigned Kind = Metadata[i].first;
209 MDNode *MD = Metadata[i].second;
210
211 for (int i = 1, e = VL.size(); MD && i != e; i++) {
212 Instruction *I = cast<Instruction>(VL[i]);
213 MDNode *IMD = I->getMetadata(Kind);
214
215 switch (Kind) {
216 default:
217 MD = nullptr; // Remove unknown metadata
218 break;
219 case LLVMContext::MD_tbaa:
220 MD = MDNode::getMostGenericTBAA(MD, IMD);
221 break;
222 case LLVMContext::MD_alias_scope:
223 MD = MDNode::getMostGenericAliasScope(MD, IMD);
224 break;
225 case LLVMContext::MD_noalias:
226 MD = MDNode::intersect(MD, IMD);
227 break;
228 case LLVMContext::MD_fpmath:
229 MD = MDNode::getMostGenericFPMath(MD, IMD);
230 break;
231 }
232 }
233 I->setMetadata(Kind, MD);
234 }
235 return I;
236 }
237
238 /// \returns The type that all of the values in \p VL have or null if there
239 /// are different types.
240 static Type* getSameType(ArrayRef<Value *> VL) {
241 Type *Ty = VL[0]->getType();
242 for (int i = 1, e = VL.size(); i < e; i++)
243 if (VL[i]->getType() != Ty)
244 return nullptr;
245
246 return Ty;
247 }
248
249 /// \returns True if the ExtractElement instructions in VL can be vectorized
250 /// to use the original vector.
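/// For example (a sketch): the bundle
///   %e0 = extractelement <4 x i32> %v, i32 0
///   %e1 = extractelement <4 x i32> %v, i32 1
///   %e2 = extractelement <4 x i32> %v, i32 2
///   %e3 = extractelement <4 x i32> %v, i32 3
/// needs no gather at all; the original vector %v can be used directly.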
251 static bool CanReuseExtract(ArrayRef<Value *> VL) {
252 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
253 // Check if all of the extracts come from the same vector and from the
254 // correct offset.
255 Value *VL0 = VL[0];
256 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
257 Value *Vec = E0->getOperand(0);
258
259 // We have to extract from the same vector type.
260 unsigned NElts = Vec->getType()->getVectorNumElements();
261
262 if (NElts != VL.size())
263 return false;
264
265 // Check that all of the indices extract from the correct offset.
266 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
267 if (!CI || CI->getZExtValue())
268 return false;
269
270 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
271 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
272 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
273
274 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
275 return false;
276 }
277
278 return true;
279 }
280
281 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
282 SmallVectorImpl<Value *> &Left,
283 SmallVectorImpl<Value *> &Right) {
284
285 SmallVector<Value *, 16> OrigLeft, OrigRight;
286
287 bool AllSameOpcodeLeft = true;
288 bool AllSameOpcodeRight = true;
289 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
290 Instruction *I = cast<Instruction>(VL[i]);
291 Value *V0 = I->getOperand(0);
292 Value *V1 = I->getOperand(1);
293
294 OrigLeft.push_back(V0);
295 OrigRight.push_back(V1);
296
297 Instruction *I0 = dyn_cast<Instruction>(V0);
298 Instruction *I1 = dyn_cast<Instruction>(V1);
299
300 // Check whether all operands on one side have the same opcode. In this case
301 // we want to preserve the original order and not make things worse by
302 // reordering.
303 AllSameOpcodeLeft = I0;
304 AllSameOpcodeRight = I1;
305
306 if (i && AllSameOpcodeLeft) {
307 if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
308 if(P0->getOpcode() != I0->getOpcode())
309 AllSameOpcodeLeft = false;
310 } else
311 AllSameOpcodeLeft = false;
312 }
313 if (i && AllSameOpcodeRight) {
314 if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
315 if(P1->getOpcode() != I1->getOpcode())
316 AllSameOpcodeRight = false;
317 } else
318 AllSameOpcodeRight = false;
319 }
320
321 // Sort two opcodes. In the code below we try to preserve the ability to use
322 // broadcast of values instead of individual inserts.
323 // vl1 = load
324 // vl2 = phi
325 // vr1 = load
326 // vr2 = vr1
327 // = vl1 x vr1
328 // = vl2 x vr2
329 // If we just sorted according to opcode we would leave the first line
330 // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
331 // = vl1 x vr1
332 // = vr2 x vl2
333 // Because vr2 and vr1 are from the same load we lose the opportunity of a
334 // broadcast for the packed right side in the backend: we have [vr1, vl2]
335 // instead of [vr1, vr2=vr1].
336 if (I0 && I1) {
337 if(!i && I0->getOpcode() > I1->getOpcode()) {
338 Left.push_back(I1);
339 Right.push_back(I0);
340 } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
341 // Try not to destroy a broadcast for no apparent benefit.
342 Left.push_back(I1);
343 Right.push_back(I0);
344 } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
345 // Try to preserve broadcasts.
346 Left.push_back(I1);
347 Right.push_back(I0);
348 } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
349 // Try to preserve broadcasts.
350 Left.push_back(I1);
351 Right.push_back(I0);
352 } else {
353 Left.push_back(I0);
354 Right.push_back(I1);
355 }
356 continue;
357 }
358 // One opcode, put the instruction on the right.
359 if (I0) {
360 Left.push_back(V1);
361 Right.push_back(I0);
362 continue;
363 }
364 Left.push_back(V0);
365 Right.push_back(V1);
366 }
367
368 bool LeftBroadcast = isSplat(Left);
369 bool RightBroadcast = isSplat(Right);
370
371 // Don't reorder if the operands were good to begin with.
372 if (!(LeftBroadcast || RightBroadcast) &&
373 (AllSameOpcodeRight || AllSameOpcodeLeft)) {
374 Left = OrigLeft;
375 Right = OrigRight;
376 }
377 }
378
379 /// \returns True if in-tree use also needs extract. This refers to
380 /// a possible scalar operand in a vectorized instruction.
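/// For example (a sketch): the pointer operand of a vectorized load or store,
/// or the scalar second argument of ctlz/cttz/powi, stays a scalar operand of
/// the resulting vector instruction, so a tree scalar feeding it still has to
/// be extracted.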
381 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
382 TargetLibraryInfo *TLI) {
383
384 unsigned Opcode = UserInst->getOpcode();
385 switch (Opcode) {
386 case Instruction::Load: {
387 LoadInst *LI = cast<LoadInst>(UserInst);
388 return (LI->getPointerOperand() == Scalar);
389 }
390 case Instruction::Store: {
391 StoreInst *SI = cast<StoreInst>(UserInst);
392 return (SI->getPointerOperand() == Scalar);
393 }
394 case Instruction::Call: {
395 CallInst *CI = cast<CallInst>(UserInst);
396 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
397 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
398 return (CI->getArgOperand(1) == Scalar);
399 }
400 }
401 default:
402 return false;
403 }
404 }
405
406 /// \returns the AA location that is being accessed by the instruction.
407 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
408 if (StoreInst *SI = dyn_cast<StoreInst>(I))
409 return AA->getLocation(SI);
410 if (LoadInst *LI = dyn_cast<LoadInst>(I))
411 return AA->getLocation(LI);
412 return AliasAnalysis::Location();
413 }
414
415 /// Bottom Up SLP Vectorizer.
416 class BoUpSLP {
417 public:
418 typedef SmallVector<Value *, 8> ValueList;
419 typedef SmallVector<Instruction *, 16> InstrList;
420 typedef SmallPtrSet<Value *, 16> ValueSet;
421 typedef SmallVector<StoreInst *, 8> StoreList;
422
423 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
424 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
425 LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
426 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
427 SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
428 Builder(Se->getContext()) {
429 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
430 }
431
432 /// \brief Vectorize the tree that starts with the elements in \p VL.
433 /// Returns the vectorized root.
434 Value *vectorizeTree();
435
436 /// \returns the cost incurred by unwanted spills and fills, caused by
437 /// holding live values over call sites.
438 int getSpillCost();
439
440 /// \returns the vectorization cost of the subtree that starts at \p VL.
441 /// A negative number means that this is profitable.
442 int getTreeCost();
443
444 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
445 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
446 void buildTree(ArrayRef<Value *> Roots,
447 ArrayRef<Value *> UserIgnoreLst = None);
448
449 /// Clear the internal data structures that are created by 'buildTree'.
450 void deleteTree() {
451 VectorizableTree.clear();
452 ScalarToTreeEntry.clear();
453 MustGather.clear();
454 ExternalUses.clear();
455 NumLoadsWantToKeepOrder = 0;
456 NumLoadsWantToChangeOrder = 0;
457 for (auto &Iter : BlocksSchedules) {
458 BlockScheduling *BS = Iter.second.get();
459 BS->clear();
460 }
461 }
462
463 /// \returns true if the memory operations A and B are consecutive.
464 bool isConsecutiveAccess(Value *A, Value *B);
465
466 /// \brief Perform LICM and CSE on the newly generated gather sequences.
467 void optimizeGatherSequence();
468
469 /// \returns true if it is beneficial to reverse the vector order.
470 bool shouldReorder() const {
471 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
472 }
473
474 private:
475 struct TreeEntry;
476
477 /// \returns the cost of the vectorizable entry.
478 int getEntryCost(TreeEntry *E);
479
480 /// This is the recursive part of buildTree.
481 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
482
483 /// Vectorize a single entry in the tree.
484 Value *vectorizeTree(TreeEntry *E);
485
486 /// Vectorize a single entry in the tree, starting in \p VL.
487 Value *vectorizeTree(ArrayRef<Value *> VL);
488
489 /// \returns the pointer to the vectorized value if \p VL is already
490 /// vectorized, or NULL. This may happen in cycles.
491 Value *alreadyVectorized(ArrayRef<Value *> VL) const;
492
493 /// \brief Take the pointer operand from the Load/Store instruction.
494 /// \returns NULL if this is not a valid Load/Store instruction.
495 static Value *getPointerOperand(Value *I);
496
497 /// \brief Take the address space operand from the Load/Store instruction.
498 /// \returns -1 if this is not a valid Load/Store instruction.
499 static unsigned getAddressSpaceOperand(Value *I);
500
501 /// \returns the scalarization cost for this type. Scalarization in this
502 /// context means the creation of vectors from a group of scalars.
503 int getGatherCost(Type *Ty);
504
505 /// \returns the scalarization cost for this list of values. Assuming that
506 /// this subtree gets vectorized, we may need to extract the values from the
507 /// roots. This method calculates the cost of extracting the values.
508 int getGatherCost(ArrayRef<Value *> VL);
509
510 /// \brief Set the Builder insert point to one after the last instruction in
511 /// the bundle
512 void setInsertPointAfterBundle(ArrayRef<Value *> VL);
513
514 /// \returns a vector from a collection of scalars in \p VL.
515 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
516
517 /// \returns whether the VectorizableTree is fully vectorizable and will
518 /// be beneficial even if the tree height is tiny.
519 bool isFullyVectorizableTinyTree();
520
521 struct TreeEntry {
522 TreeEntry() : Scalars(), VectorizedValue(nullptr),
523 NeedToGather(0) {}
524
525 /// \returns true if the scalars in VL are equal to this entry.
526 bool isSame(ArrayRef<Value *> VL) const {
527 assert(VL.size() == Scalars.size() && "Invalid size");
528 return std::equal(VL.begin(), VL.end(), Scalars.begin());
529 }
530
531 /// A vector of scalars.
532 ValueList Scalars;
533
534 /// The Scalars are vectorized into this value. It is initialized to Null.
535 Value *VectorizedValue;
536
537 /// Do we need to gather this sequence?
538 bool NeedToGather;
539 };
540
541 /// Create a new VectorizableTree entry.
542 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
543 VectorizableTree.push_back(TreeEntry());
544 int idx = VectorizableTree.size() - 1;
545 TreeEntry *Last = &VectorizableTree[idx];
546 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
547 Last->NeedToGather = !Vectorized;
548 if (Vectorized) {
549 for (int i = 0, e = VL.size(); i != e; ++i) {
550 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
551 ScalarToTreeEntry[VL[i]] = idx;
552 }
553 } else {
554 MustGather.insert(VL.begin(), VL.end());
555 }
556 return Last;
557 }
558
559 /// -- Vectorization State --
560 /// Holds all of the tree entries.
561 std::vector<TreeEntry> VectorizableTree;
562
563 /// Maps a specific scalar to its tree entry.
564 SmallDenseMap<Value*, int> ScalarToTreeEntry;
565
566 /// A list of scalars that we found that we need to keep as scalars.
567 ValueSet MustGather;
568
569 /// This POD struct describes one external user in the vectorized tree.
570 struct ExternalUser {
571 ExternalUser (Value *S, llvm::User *U, int L) :
572 Scalar(S), User(U), Lane(L){};
573 // Which scalar in our function.
574 Value *Scalar;
575 // Which user that uses the scalar.
576 llvm::User *User;
577 // Which lane does the scalar belong to.
578 int Lane;
579 };
580 typedef SmallVector<ExternalUser, 16> UserList;
581
582 /// Checks if two instructions may access the same memory.
583 ///
584 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
585 /// is invariant in the calling loop.
586 bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
587 Instruction *Inst2) {
588
589 // First check if the result is already in the cache.
590 AliasCacheKey key = std::make_pair(Inst1, Inst2);
591 Optional<bool> &result = AliasCache[key];
592 if (result.hasValue()) {
593 return result.getValue();
594 }
595 AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
596 bool aliased = true;
597 if (Loc1.Ptr && Loc2.Ptr) {
598 // Do the alias check.
599 aliased = AA->alias(Loc1, Loc2);
600 }
601 // Store the result in the cache.
602 result = aliased;
603 return aliased;
604 }
605
606 typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
607
608 /// Cache for alias results.
609 /// TODO: consider moving this to the AliasAnalysis itself.
610 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
611
612 /// Removes an instruction from its block and eventually deletes it.
613 /// It's like Instruction::eraseFromParent() except that the actual deletion
614 /// is delayed until BoUpSLP is destructed.
615 /// This is required to ensure that there are no incorrect collisions in the
616 /// AliasCache, which can happen if a new instruction is allocated at the
617 /// same address as a previously deleted instruction.
618 void eraseInstruction(Instruction *I) {
619 I->removeFromParent();
620 I->dropAllReferences();
621 DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
622 }
623
624 /// Temporary store for deleted instructions. Instructions will be deleted
625 /// eventually when the BoUpSLP is destructed.
626 SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
627
628 /// A list of values that need to be extracted out of the tree.
629 /// This list holds pairs of (Internal Scalar : External User).
630 UserList ExternalUses;
631
632 /// Values used only by @llvm.assume calls.
633 SmallPtrSet<const Value *, 32> EphValues;
634
635 /// Holds all of the instructions that we gathered.
636 SetVector<Instruction *> GatherSeq;
637 /// A list of blocks that we are going to CSE.
638 SetVector<BasicBlock *> CSEBlocks;
639
640 /// Contains all scheduling relevant data for an instruction.
641 /// A ScheduleData either represents a single instruction or a member of an
642 /// instruction bundle (= a group of instructions which is combined into a
643 /// vector instruction).
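/// For example (a sketch): a bundle of four scalar adds is represented by four
/// ScheduleData objects chained through NextInBundle, each pointing at the
/// first one via FirstInBundle; the whole chain is scheduled as a single unit.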
644 struct ScheduleData {
645
646 // The initial value for the dependency counters. It means that the
647 // dependencies are not calculated yet.
648 enum { InvalidDeps = -1 };
649
650 ScheduleData()
651 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
652 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
653 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
654 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
655
656 void init(int BlockSchedulingRegionID) {
657 FirstInBundle = this;
658 NextInBundle = nullptr;
659 NextLoadStore = nullptr;
660 IsScheduled = false;
661 SchedulingRegionID = BlockSchedulingRegionID;
662 UnscheduledDepsInBundle = UnscheduledDeps;
663 clearDependencies();
664 }
665
666 /// Returns true if the dependency information has been calculated.
667 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
668
669 /// Returns true for single instructions and for bundle representatives
670 /// (= the head of a bundle).
671 bool isSchedulingEntity() const { return FirstInBundle == this; }
672
673 /// Returns true if it represents an instruction bundle and not only a
674 /// single instruction.
675 bool isPartOfBundle() const {
676 return NextInBundle != nullptr || FirstInBundle != this;
677 }
678
679 /// Returns true if it is ready for scheduling, i.e. it has no more
680 /// unscheduled depending instructions/bundles.
681 bool isReady() const {
682 assert(isSchedulingEntity() &&
683 "can't consider non-scheduling entity for ready list");
684 return UnscheduledDepsInBundle == 0 && !IsScheduled;
685 }
686
687 /// Modifies the number of unscheduled dependencies, also updating it for
688 /// the whole bundle.
689 int incrementUnscheduledDeps(int Incr) {
690 UnscheduledDeps += Incr;
691 return FirstInBundle->UnscheduledDepsInBundle += Incr;
692 }
693
694 /// Sets the number of unscheduled dependencies to the number of
695 /// dependencies.
696 void resetUnscheduledDeps() {
697 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
698 }
699
700 /// Clears all dependency information.
701 void clearDependencies() {
702 Dependencies = InvalidDeps;
703 resetUnscheduledDeps();
704 MemoryDependencies.clear();
705 }
706
707 void dump(raw_ostream &os) const {
708 if (!isSchedulingEntity()) {
709 os << "/ " << *Inst;
710 } else if (NextInBundle) {
711 os << '[' << *Inst;
712 ScheduleData *SD = NextInBundle;
713 while (SD) {
714 os << ';' << *SD->Inst;
715 SD = SD->NextInBundle;
716 }
717 os << ']';
718 } else {
719 os << *Inst;
720 }
721 }
722
723 Instruction *Inst;
724
725 /// Points to the head in an instruction bundle (and always to this for
726 /// single instructions).
727 ScheduleData *FirstInBundle;
728
729 /// Single linked list of all instructions in a bundle. Null if it is a
730 /// single instruction.
731 ScheduleData *NextInBundle;
732
733 /// Single linked list of all memory instructions (e.g. load, store, call)
734 /// in the block - until the end of the scheduling region.
735 ScheduleData *NextLoadStore;
736
737 /// The dependent memory instructions.
738 /// This list is derived on demand in calculateDependencies().
739 SmallVector<ScheduleData *, 4> MemoryDependencies;
740
741 /// This ScheduleData is in the current scheduling region if this matches
742 /// the current SchedulingRegionID of BlockScheduling.
743 int SchedulingRegionID;
744
745 /// Used for getting a "good" final ordering of instructions.
746 int SchedulingPriority;
747
748 /// The number of dependencies. It consists of the number of users of the
749 /// instruction plus the number of dependent memory instructions (if any).
750 /// This value is calculated on demand.
751 /// If InvalidDeps, the number of dependencies is not calculated yet.
752 ///
753 int Dependencies;
754
755 /// The number of dependencies minus the number of dependencies of scheduled
756 /// instructions. As soon as this is zero, the instruction/bundle gets ready
757 /// for scheduling.
758 /// Note that this is negative as long as Dependencies is not calculated.
759 int UnscheduledDeps;
760
761 /// The sum of UnscheduledDeps in a bundle. Equal to UnscheduledDeps for
762 /// single instructions.
763 int UnscheduledDepsInBundle;
764
765 /// True if this instruction is scheduled (or considered as scheduled in the
766 /// dry-run).
767 bool IsScheduled;
768 };
769
770 #ifndef NDEBUG
771 friend raw_ostream &operator<<(raw_ostream &os,
772 const BoUpSLP::ScheduleData &SD);
773 #endif
774
775 /// Contains all scheduling data for a basic block.
776 ///
777 struct BlockScheduling {
778
779 BlockScheduling(BasicBlock *BB)
780 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
781 ScheduleStart(nullptr), ScheduleEnd(nullptr),
782 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
783 // Make sure that the initial SchedulingRegionID is greater than the
784 // initial SchedulingRegionID in ScheduleData (which is 0).
785 SchedulingRegionID(1) {}
786
787 void clear() {
788 ReadyInsts.clear();
789 ScheduleStart = nullptr;
790 ScheduleEnd = nullptr;
791 FirstLoadStoreInRegion = nullptr;
792 LastLoadStoreInRegion = nullptr;
793
794 // Make a new scheduling region, i.e. all existing ScheduleData is not
795 // in the new region yet.
796 ++SchedulingRegionID;
797 }
798
799 ScheduleData *getScheduleData(Value *V) {
800 ScheduleData *SD = ScheduleDataMap[V];
801 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
802 return SD;
803 return nullptr;
804 }
805
806 bool isInSchedulingRegion(ScheduleData *SD) {
807 return SD->SchedulingRegionID == SchedulingRegionID;
808 }
809
810 /// Marks an instruction as scheduled and puts all dependent ready
811 /// instructions into the ready-list.
812 template <typename ReadyListType>
813 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
814 SD->IsScheduled = true;
815 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
816
817 ScheduleData *BundleMember = SD;
818 while (BundleMember) {
819 // Handle the def-use chain dependencies.
820 for (Use &U : BundleMember->Inst->operands()) {
821 ScheduleData *OpDef = getScheduleData(U.get());
822 if (OpDef && OpDef->hasValidDependencies() &&
823 OpDef->incrementUnscheduledDeps(-1) == 0) {
824 // There are no more unscheduled dependencies after decrementing,
825 // so we can put the dependent instruction into the ready list.
826 ScheduleData *DepBundle = OpDef->FirstInBundle;
827 assert(!DepBundle->IsScheduled &&
828 "already scheduled bundle gets ready");
829 ReadyList.insert(DepBundle);
830 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
831 }
832 }
833 // Handle the memory dependencies.
834 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
835 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
836 // There are no more unscheduled dependencies after decrementing,
837 // so we can put the dependent instruction into the ready list.
838 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
839 assert(!DepBundle->IsScheduled &&
840 "already scheduled bundle gets ready");
841 ReadyList.insert(DepBundle);
842 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
843 }
844 }
845 BundleMember = BundleMember->NextInBundle;
846 }
847 }
848
849 /// Put all instructions into the ReadyList which are ready for scheduling.
850 template <typename ReadyListType>
851 void initialFillReadyList(ReadyListType &ReadyList) {
852 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
853 ScheduleData *SD = getScheduleData(I);
854 if (SD->isSchedulingEntity() && SD->isReady()) {
855 ReadyList.insert(SD);
856 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
857 }
858 }
859 }
860
861 /// Checks if a bundle of instructions can be scheduled, i.e. has no
862 /// cyclic dependencies. This is only a dry-run, no instructions are
863 /// actually moved at this stage.
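/// For example (a sketch): the bundle { %a, %b } with
///   %a = add i32 %x, 1
///   %t = mul i32 %a, 2
///   %b = add i32 %t, 3
/// cannot be scheduled, because %b depends on %a through %t and no single
/// insertion point can satisfy both orders.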
864 bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
865
866 /// Un-bundles a group of instructions.
867 void cancelScheduling(ArrayRef<Value *> VL);
868
869 /// Extends the scheduling region so that V is inside the region.
870 void extendSchedulingRegion(Value *V);
871
872 /// Initialize the ScheduleData structures for new instructions in the
873 /// scheduling region.
874 void initScheduleData(Instruction *FromI, Instruction *ToI,
875 ScheduleData *PrevLoadStore,
876 ScheduleData *NextLoadStore);
877
878 /// Updates the dependency information of a bundle and of all instructions/
879 /// bundles which depend on the original bundle.
880 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
881 BoUpSLP *SLP);
882
883 /// Sets all instructions in the scheduling region to un-scheduled.
884 void resetSchedule();
885
886 BasicBlock *BB;
887
888 /// Simple memory allocation for ScheduleData.
889 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
890
891 /// The size of a ScheduleData array in ScheduleDataChunks.
892 int ChunkSize;
893
894 /// The allocator position in the current chunk, which is the last entry
895 /// of ScheduleDataChunks.
896 int ChunkPos;
897
898 /// Attaches ScheduleData to Instruction.
899 /// Note that the mapping survives during all vectorization iterations, i.e.
900 /// ScheduleData structures are recycled.
901 DenseMap<Value *, ScheduleData *> ScheduleDataMap;
902
903 struct ReadyList : SmallVector<ScheduleData *, 8> {
904 void insert(ScheduleData *SD) { push_back(SD); }
905 };
906
907 /// The ready-list for scheduling (only used for the dry-run).
908 ReadyList ReadyInsts;
909
910 /// The first instruction of the scheduling region.
911 Instruction *ScheduleStart;
912
913 /// The first instruction _after_ the scheduling region.
914 Instruction *ScheduleEnd;
915
916 /// The first memory accessing instruction in the scheduling region
917 /// (can be null).
918 ScheduleData *FirstLoadStoreInRegion;
919
920 /// The last memory accessing instruction in the scheduling region
921 /// (can be null).
922 ScheduleData *LastLoadStoreInRegion;
923
924 /// The ID of the scheduling region. For a new vectorization iteration this
925 /// is incremented which "removes" all ScheduleData from the region.
926 int SchedulingRegionID;
927 };
928
929 /// Attaches the BlockScheduling structures to basic blocks.
930 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
931
932 /// Performs the "real" scheduling. Done before vectorization is actually
933 /// performed in a basic block.
934 void scheduleBlock(BlockScheduling *BS);
935
936 /// List of users to ignore during scheduling and that don't need extracting.
937 ArrayRef<Value *> UserIgnoreList;
938
939 // Number of load-bundles, which contain consecutive loads.
940 int NumLoadsWantToKeepOrder;
941
942 // Number of load-bundles of size 2, which are consecutive loads if reversed.
943 int NumLoadsWantToChangeOrder;
944
945 // Analysis and block reference.
946 Function *F;
947 ScalarEvolution *SE;
948 const DataLayout *DL;
949 TargetTransformInfo *TTI;
950 TargetLibraryInfo *TLI;
951 AliasAnalysis *AA;
952 LoopInfo *LI;
953 DominatorTree *DT;
954 /// Instruction builder to construct the vectorized tree.
955 IRBuilder<> Builder;
956 };
957
958 #ifndef NDEBUG
959 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
960 SD.dump(os);
961 return os;
962 }
963 #endif
964
965 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
966 ArrayRef<Value *> UserIgnoreLst) {
967 deleteTree();
968 UserIgnoreList = UserIgnoreLst;
969 if (!getSameType(Roots))
970 return;
971 buildTree_rec(Roots, 0);
972
973 // Collect the values that we need to extract from the tree.
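// For example (a sketch): if a scalar ends up in lane 2 of a vector add but
// is also used by a call outside the tree, an extractelement of lane 2 must
// be generated for that external user; such (scalar, user, lane) triples are
// recorded in ExternalUses below.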
974 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
975 TreeEntry *Entry = &VectorizableTree[EIdx];
976
977 // For each lane:
978 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
979 Value *Scalar = Entry->Scalars[Lane];
980
981 // No need to handle users of gathered values.
982 if (Entry->NeedToGather)
983 continue;
984
985 for (User *U : Scalar->users()) {
986 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
987
988 Instruction *UserInst = dyn_cast<Instruction>(U);
989 if (!UserInst)
990 continue;
991
992 // Skip in-tree scalars that become vectors
993 if (ScalarToTreeEntry.count(U)) {
994 int Idx = ScalarToTreeEntry[U];
995 TreeEntry *UseEntry = &VectorizableTree[Idx];
996 Value *UseScalar = UseEntry->Scalars[0];
997 // Some in-tree scalars will remain as scalar in vectorized
998 // instructions. If that is the case, the one in Lane 0 will
999 // be used.
1000 if (UseScalar != U ||
1001 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
1002 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
1003 << ".\n");
1004 assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
1005 continue;
1006 }
1007 }
1008
1009 // Ignore users in the user ignore list.
1010 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
1011 UserIgnoreList.end())
1012 continue;
1013
1014 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
1015 Lane << " from " << *Scalar << ".\n");
1016 ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
1017 }
1018 }
1019 }
1020 }
1021
1022
1023 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
1024 bool SameTy = getSameType(VL); (void)SameTy;
1025 bool isAltShuffle = false;
1026 assert(SameTy && "Invalid types!");
1027
1028 if (Depth == RecursionMaxDepth) {
1029 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1030 newTreeEntry(VL, false);
1031 return;
1032 }
1033
1034 // Don't handle vectors.
1035 if (VL[0]->getType()->isVectorTy()) {
1036 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1037 newTreeEntry(VL, false);
1038 return;
1039 }
1040
1041 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1042 if (SI->getValueOperand()->getType()->isVectorTy()) {
1043 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1044 newTreeEntry(VL, false);
1045 return;
1046 }
1047 unsigned Opcode = getSameOpcode(VL);
1048
1049 // Check that this shuffle vector refers to the alternate
1050 // sequence of opcodes.
1051 if (Opcode == Instruction::ShuffleVector) {
1052 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
1053 unsigned Op = I0->getOpcode();
1054 if (Op != Instruction::ShuffleVector)
1055 isAltShuffle = true;
1056 }
1057
1058 // If all of the operands are identical or constant we have a simple solution.
1059 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
1060 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1061 newTreeEntry(VL, false);
1062 return;
1063 }
1064
1065 // We now know that this is a vector of instructions of the same type from
1066 // the same block.
1067
1068 // Don't vectorize ephemeral values.
1069 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1070 if (EphValues.count(VL[i])) {
1071 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1072 ") is ephemeral.\n");
1073 newTreeEntry(VL, false);
1074 return;
1075 }
1076 }
1077
1078 // Check if this is a duplicate of another entry.
1079 if (ScalarToTreeEntry.count(VL[0])) {
1080 int Idx = ScalarToTreeEntry[VL[0]];
1081 TreeEntry *E = &VectorizableTree[Idx];
1082 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1083 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1084 if (E->Scalars[i] != VL[i]) {
1085 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1086 newTreeEntry(VL, false);
1087 return;
1088 }
1089 }
1090 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1091 return;
1092 }
1093
1094 // Check that none of the instructions in the bundle are already in the tree.
1095 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1096 if (ScalarToTreeEntry.count(VL[i])) {
1097 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1098 ") is already in tree.\n");
1099 newTreeEntry(VL, false);
1100 return;
1101 }
1102 }
1103
1104 // If any of the scalars is marked as a value that needs to stay scalar then
1105 // we need to gather the scalars.
1106 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1107 if (MustGather.count(VL[i])) {
1108 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1109 newTreeEntry(VL, false);
1110 return;
1111 }
1112 }
1113
1114 // Check that all of the users of the scalars that we want to vectorize are
1115 // schedulable.
1116 Instruction *VL0 = cast<Instruction>(VL[0]);
1117 BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1118
1119 if (!DT->isReachableFromEntry(BB)) {
1120 // Don't go into unreachable blocks. They may contain instructions with
1121 // dependency cycles which confuse the final scheduling.
1122 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1123 newTreeEntry(VL, false);
1124 return;
1125 }
1126
1127 // Check that every instruction appears only once in this bundle.
1128 for (unsigned i = 0, e = VL.size(); i < e; ++i)
1129 for (unsigned j = i+1; j < e; ++j)
1130 if (VL[i] == VL[j]) {
1131 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1132 newTreeEntry(VL, false);
1133 return;
1134 }
1135
1136 auto &BSRef = BlocksSchedules[BB];
1137 if (!BSRef) {
1138 BSRef = llvm::make_unique<BlockScheduling>(BB);
1139 }
1140 BlockScheduling &BS = *BSRef.get();
1141
1142 if (!BS.tryScheduleBundle(VL, this)) {
1143 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1144 BS.cancelScheduling(VL);
1145 newTreeEntry(VL, false);
1146 return;
1147 }
1148 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1149
1150 switch (Opcode) {
1151 case Instruction::PHI: {
1152 PHINode *PH = dyn_cast<PHINode>(VL0);
1153
1154 // Check for terminator values (e.g. invoke).
1155 for (unsigned j = 0; j < VL.size(); ++j)
1156 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1157 TerminatorInst *Term = dyn_cast<TerminatorInst>(
1158 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1159 if (Term) {
1160 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1161 BS.cancelScheduling(VL);
1162 newTreeEntry(VL, false);
1163 return;
1164 }
1165 }
1166
1167 newTreeEntry(VL, true);
1168 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1169
1170 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1171 ValueList Operands;
1172 // Prepare the operand vector.
1173 for (unsigned j = 0; j < VL.size(); ++j)
1174 Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1175 PH->getIncomingBlock(i)));
1176
1177 buildTree_rec(Operands, Depth + 1);
1178 }
1179 return;
1180 }
1181 case Instruction::ExtractElement: {
1182 bool Reuse = CanReuseExtract(VL);
1183 if (Reuse) {
1184 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1185 } else {
1186 BS.cancelScheduling(VL);
1187 }
1188 newTreeEntry(VL, Reuse);
1189 return;
1190 }
1191 case Instruction::Load: {
1192 // Check if the loads are consecutive or if we need to swizzle them.
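// For example (a sketch): { load %p[0], load %p[1], load %p[2], load %p[3] }
// vectorizes directly, while the two-element bundle { load %p[1], load %p[0] }
// is consecutive only when reversed; it is counted in NumLoadsWantToChangeOrder
// so that the caller may retry with the operands in reversed order.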
1193 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1194 LoadInst *L = cast<LoadInst>(VL[i]);
1195 if (!L->isSimple()) {
1196 BS.cancelScheduling(VL);
1197 newTreeEntry(VL, false);
1198 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1199 return;
1200 }
1201 if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1202 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
1203 ++NumLoadsWantToChangeOrder;
1204 }
1205 BS.cancelScheduling(VL);
1206 newTreeEntry(VL, false);
1207 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1208 return;
1209 }
1210 }
1211 ++NumLoadsWantToKeepOrder;
1212 newTreeEntry(VL, true);
1213 DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1214 return;
1215 }
1216 case Instruction::ZExt:
1217 case Instruction::SExt:
1218 case Instruction::FPToUI:
1219 case Instruction::FPToSI:
1220 case Instruction::FPExt:
1221 case Instruction::PtrToInt:
1222 case Instruction::IntToPtr:
1223 case Instruction::SIToFP:
1224 case Instruction::UIToFP:
1225 case Instruction::Trunc:
1226 case Instruction::FPTrunc:
1227 case Instruction::BitCast: {
1228 Type *SrcTy = VL0->getOperand(0)->getType();
1229 for (unsigned i = 0; i < VL.size(); ++i) {
1230 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1231 if (Ty != SrcTy || !isValidElementType(Ty)) {
1232 BS.cancelScheduling(VL);
1233 newTreeEntry(VL, false);
1234 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1235 return;
1236 }
1237 }
1238 newTreeEntry(VL, true);
1239 DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1240
1241 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1242 ValueList Operands;
1243 // Prepare the operand vector.
1244 for (unsigned j = 0; j < VL.size(); ++j)
1245 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1246
1247 buildTree_rec(Operands, Depth+1);
1248 }
1249 return;
1250 }
1251 case Instruction::ICmp:
1252 case Instruction::FCmp: {
1253 // Check that all of the compares have the same predicate.
1254 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
1255 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1256 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1257 CmpInst *Cmp = cast<CmpInst>(VL[i]);
1258 if (Cmp->getPredicate() != P0 ||
1259 Cmp->getOperand(0)->getType() != ComparedTy) {
1260 BS.cancelScheduling(VL);
1261 newTreeEntry(VL, false);
1262 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1263 return;
1264 }
1265 }
1266
1267 newTreeEntry(VL, true);
1268 DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1269
1270 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1271 ValueList Operands;
1272 // Prepare the operand vector.
1273 for (unsigned j = 0; j < VL.size(); ++j)
1274 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1275
1276 buildTree_rec(Operands, Depth+1);
1277 }
1278 return;
1279 }
1280 case Instruction::Select:
1281 case Instruction::Add:
1282 case Instruction::FAdd:
1283 case Instruction::Sub:
1284 case Instruction::FSub:
1285 case Instruction::Mul:
1286 case Instruction::FMul:
1287 case Instruction::UDiv:
1288 case Instruction::SDiv:
1289 case Instruction::FDiv:
1290 case Instruction::URem:
1291 case Instruction::SRem:
1292 case Instruction::FRem:
1293 case Instruction::Shl:
1294 case Instruction::LShr:
1295 case Instruction::AShr:
1296 case Instruction::And:
1297 case Instruction::Or:
1298 case Instruction::Xor: {
1299 newTreeEntry(VL, true);
1300 DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1301
1302 // Sort operands of the instructions so that each side is more likely to
1303 // have the same opcode.
1304 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1305 ValueList Left, Right;
1306 reorderInputsAccordingToOpcode(VL, Left, Right);
1307 buildTree_rec(Left, Depth + 1);
1308 buildTree_rec(Right, Depth + 1);
1309 return;
1310 }
1311
1312 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1313 ValueList Operands;
1314 // Prepare the operand vector.
1315 for (unsigned j = 0; j < VL.size(); ++j)
1316 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1317
1318 buildTree_rec(Operands, Depth+1);
1319 }
1320 return;
1321 }
1322 case Instruction::GetElementPtr: {
1323 // We don't combine GEPs with complicated (nested) indexing.
1324 for (unsigned j = 0; j < VL.size(); ++j) {
1325 if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1326 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1327 BS.cancelScheduling(VL);
1328 newTreeEntry(VL, false);
1329 return;
1330 }
1331 }
1332
1333 // We can't combine several GEPs into one vector if they operate on
1334 // different types.
1335 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1336 for (unsigned j = 0; j < VL.size(); ++j) {
1337 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1338 if (Ty0 != CurTy) {
1339 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1340 BS.cancelScheduling(VL);
1341 newTreeEntry(VL, false);
1342 return;
1343 }
1344 }
1345
1346 // We don't combine GEPs with non-constant indexes.
1347 for (unsigned j = 0; j < VL.size(); ++j) {
1348 auto Op = cast<Instruction>(VL[j])->getOperand(1);
1349 if (!isa<ConstantInt>(Op)) {
1350 DEBUG(
1351 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1352 BS.cancelScheduling(VL);
1353 newTreeEntry(VL, false);
1354 return;
1355 }
1356 }
1357
1358 newTreeEntry(VL, true);
1359 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1360 for (unsigned i = 0, e = 2; i < e; ++i) {
1361 ValueList Operands;
1362 // Prepare the operand vector.
1363 for (unsigned j = 0; j < VL.size(); ++j)
1364 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1365
1366 buildTree_rec(Operands, Depth + 1);
1367 }
1368 return;
1369 }
1370 case Instruction::Store: {
1371 // Check if the stores are consecutive or if we need to swizzle them.
1372 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1373 if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1374 BS.cancelScheduling(VL);
1375 newTreeEntry(VL, false);
1376 DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1377 return;
1378 }
1379
1380 newTreeEntry(VL, true);
1381 DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1382
1383 ValueList Operands;
1384 for (unsigned j = 0; j < VL.size(); ++j)
1385 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1386
1387 buildTree_rec(Operands, Depth + 1);
1388 return;
1389 }
1390 case Instruction::Call: {
1391 // Check if the calls are all to the same vectorizable intrinsic.
1392 CallInst *CI = cast<CallInst>(VL[0]);
1393 // Check if this is an Intrinsic call or something that can be
1394 // represented by an intrinsic call
1395 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1396 if (!isTriviallyVectorizable(ID)) {
1397 BS.cancelScheduling(VL);
1398 newTreeEntry(VL, false);
1399 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1400 return;
1401 }
1402 Function *Int = CI->getCalledFunction();
1403 Value *A1I = nullptr;
1404 if (hasVectorInstrinsicScalarOpd(ID, 1))
1405 A1I = CI->getArgOperand(1);
1406 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1407 CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1408 if (!CI2 || CI2->getCalledFunction() != Int ||
1409 getIntrinsicIDForCall(CI2, TLI) != ID) {
1410 BS.cancelScheduling(VL);
1411 newTreeEntry(VL, false);
1412 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1413 << "\n");
1414 return;
1415 }
1416 // ctlz, cttz and powi are special intrinsics whose second argument
1417 // must be the same for them to be vectorized.
1418 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1419 Value *A1J = CI2->getArgOperand(1);
1420 if (A1I != A1J) {
1421 BS.cancelScheduling(VL);
1422 newTreeEntry(VL, false);
1423 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1424 << " argument "<< A1I<<"!=" << A1J
1425 << "\n");
1426 return;
1427 }
1428 }
1429 }
1430
1431 newTreeEntry(VL, true);
1432 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1433 ValueList Operands;
1434 // Prepare the operand vector.
1435 for (unsigned j = 0; j < VL.size(); ++j) {
1436 CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1437 Operands.push_back(CI2->getArgOperand(i));
1438 }
1439 buildTree_rec(Operands, Depth + 1);
1440 }
1441 return;
1442 }
1443 case Instruction::ShuffleVector: {
1444 // If this is not an alternate sequence of opcodes like add-sub
1445 // then do not vectorize this instruction.
1446 if (!isAltShuffle) {
1447 BS.cancelScheduling(VL);
1448 newTreeEntry(VL, false);
1449 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1450 return;
1451 }
1452 newTreeEntry(VL, true);
1453 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1454 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1455 ValueList Operands;
1456 // Prepare the operand vector.
1457 for (unsigned j = 0; j < VL.size(); ++j)
1458 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1459
1460 buildTree_rec(Operands, Depth + 1);
1461 }
1462 return;
1463 }
1464 default:
1465 BS.cancelScheduling(VL);
1466 newTreeEntry(VL, false);
1467 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1468 return;
1469 }
1470 }
1471
1472 int BoUpSLP::getEntryCost(TreeEntry *E) {
1473 ArrayRef<Value*> VL = E->Scalars;
1474
1475 Type *ScalarTy = VL[0]->getType();
1476 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1477 ScalarTy = SI->getValueOperand()->getType();
1478 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1479
1480 if (E->NeedToGather) {
1481 if (allConstant(VL))
1482 return 0;
1483 if (isSplat(VL)) {
1484 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1485 }
1486 return getGatherCost(E->Scalars);
1487 }
1488 unsigned Opcode = getSameOpcode(VL);
1489 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1490 Instruction *VL0 = cast<Instruction>(VL[0]);
1491 switch (Opcode) {
1492 case Instruction::PHI: {
1493 return 0;
1494 }
1495 case Instruction::ExtractElement: {
1496 if (CanReuseExtract(VL)) {
1497 int DeadCost = 0;
1498 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1499 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1500 if (E->hasOneUse())
1501 // Take credit for instruction that will become dead.
1502 DeadCost +=
1503 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1504 }
1505 return -DeadCost;
1506 }
1507 return getGatherCost(VecTy);
1508 }
1509 case Instruction::ZExt:
1510 case Instruction::SExt:
1511 case Instruction::FPToUI:
1512 case Instruction::FPToSI:
1513 case Instruction::FPExt:
1514 case Instruction::PtrToInt:
1515 case Instruction::IntToPtr:
1516 case Instruction::SIToFP:
1517 case Instruction::UIToFP:
1518 case Instruction::Trunc:
1519 case Instruction::FPTrunc:
1520 case Instruction::BitCast: {
1521 Type *SrcTy = VL0->getOperand(0)->getType();
1522
1523 // Calculate the cost of this instruction.
1524 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1525 VL0->getType(), SrcTy);
1526
1527 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1528 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
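// For instance (illustrative only, the numbers are target dependent): for
// four sext i8 -> i32 operations the comparison is
//   4 * getCastInstrCost(SExt, i32, i8)  vs.  getCastInstrCost(SExt, <4 x i32>, <4 x i8>)
// and a negative difference means the vector form is cheaper.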
1529 return VecCost - ScalarCost;
1530 }
1531 case Instruction::FCmp:
1532 case Instruction::ICmp:
1533 case Instruction::Select:
1534 case Instruction::Add:
1535 case Instruction::FAdd:
1536 case Instruction::Sub:
1537 case Instruction::FSub:
1538 case Instruction::Mul:
1539 case Instruction::FMul:
1540 case Instruction::UDiv:
1541 case Instruction::SDiv:
1542 case Instruction::FDiv:
1543 case Instruction::URem:
1544 case Instruction::SRem:
1545 case Instruction::FRem:
1546 case Instruction::Shl:
1547 case Instruction::LShr:
1548 case Instruction::AShr:
1549 case Instruction::And:
1550 case Instruction::Or:
1551 case Instruction::Xor: {
1552 // Calculate the cost of this instruction.
1553 int ScalarCost = 0;
1554 int VecCost = 0;
1555 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1556 Opcode == Instruction::Select) {
1557 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1558 ScalarCost = VecTy->getNumElements() *
1559 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1560 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1561 } else {
1562 // Certain instructions can be cheaper to vectorize if they have a
1563 // constant second vector operand.
1564 TargetTransformInfo::OperandValueKind Op1VK =
1565 TargetTransformInfo::OK_AnyValue;
1566 TargetTransformInfo::OperandValueKind Op2VK =
1567 TargetTransformInfo::OK_UniformConstantValue;
1568 TargetTransformInfo::OperandValueProperties Op1VP =
1569 TargetTransformInfo::OP_None;
1570 TargetTransformInfo::OperandValueProperties Op2VP =
1571 TargetTransformInfo::OP_None;
1572
1573 // If all operands are exactly the same ConstantInt then set the
1574 // operand kind to OK_UniformConstantValue.
1575 // If instead not all operands are constants, then set the operand kind
1576 // to OK_AnyValue. If all operands are constants but not the same,
1577 // then set the operand kind to OK_NonUniformConstantValue.
1578 ConstantInt *CInt = nullptr;
1579 for (unsigned i = 0; i < VL.size(); ++i) {
1580 const Instruction *I = cast<Instruction>(VL[i]);
1581 if (!isa<ConstantInt>(I->getOperand(1))) {
1582 Op2VK = TargetTransformInfo::OK_AnyValue;
1583 break;
1584 }
1585 if (i == 0) {
1586 CInt = cast<ConstantInt>(I->getOperand(1));
1587 continue;
1588 }
1589 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1590 CInt != cast<ConstantInt>(I->getOperand(1)))
1591 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1592 }
1593 // FIXME: Currently the cost model modification for division by a
1594 // power of 2 is handled only for X86. Add support for other targets.
1595 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1596 CInt->getValue().isPowerOf2())
1597 Op2VP = TargetTransformInfo::OP_PowerOf2;
1598
1599 ScalarCost = VecTy->getNumElements() *
1600 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1601 Op1VP, Op2VP);
1602 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1603 Op1VP, Op2VP);
1604 }
1605 return VecCost - ScalarCost;
1606 }
1607 case Instruction::GetElementPtr: {
1608 TargetTransformInfo::OperandValueKind Op1VK =
1609 TargetTransformInfo::OK_AnyValue;
1610 TargetTransformInfo::OperandValueKind Op2VK =
1611 TargetTransformInfo::OK_UniformConstantValue;
1612
1613 int ScalarCost =
1614 VecTy->getNumElements() *
1615 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1616 int VecCost =
1617 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1618
1619 return VecCost - ScalarCost;
1620 }
1621 case Instruction::Load: {
1622 // Cost of wide load - cost of scalar loads.
1623 int ScalarLdCost = VecTy->getNumElements() *
1624 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1625 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1626 return VecLdCost - ScalarLdCost;
1627 }
1628 case Instruction::Store: {
1629 // We know that we can merge the stores. Calculate the cost.
1630 int ScalarStCost = VecTy->getNumElements() *
1631 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1632 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1633 return VecStCost - ScalarStCost;
1634 }
1635 case Instruction::Call: {
1636 CallInst *CI = cast<CallInst>(VL0);
1637 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1638
1639 // Calculate the cost of the scalar and vector calls.
1640 SmallVector<Type*, 4> ScalarTys, VecTys;
1641 for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
1642 ScalarTys.push_back(CI->getArgOperand(op)->getType());
1643 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1644 VecTy->getNumElements()));
1645 }
1646
1647 int ScalarCallCost = VecTy->getNumElements() *
1648 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1649
1650 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1651
1652 DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
1653 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
1654 << " for " << *CI << "\n");
1655
1656 return VecCallCost - ScalarCallCost;
1657 }
1658 case Instruction::ShuffleVector: {
1659 TargetTransformInfo::OperandValueKind Op1VK =
1660 TargetTransformInfo::OK_AnyValue;
1661 TargetTransformInfo::OperandValueKind Op2VK =
1662 TargetTransformInfo::OK_AnyValue;
1663 int ScalarCost = 0;
1664 int VecCost = 0;
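// The scalar cost is the sum of the costs of the individual (alternating)
// scalar operations.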
1665 for (unsigned i = 0; i < VL.size(); ++i) {
1666 Instruction *I = cast<Instruction>(VL[i]);
1667 if (!I)
1668 break;
1669 ScalarCost +=
1670 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1671 }
1672 // VecCost is equal to sum of the cost of creating 2 vectors
1673 // and the cost of creating shuffle.
1674 Instruction *I0 = cast<Instruction>(VL[0]);
1675 VecCost =
1676 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1677 Instruction *I1 = cast<Instruction>(VL[1]);
1678 VecCost +=
1679 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1680 VecCost +=
1681 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1682 return VecCost - ScalarCost;
1683 }
1684 default:
1685 llvm_unreachable("Unknown instruction");
1686 }
1687 }
1688
1689 bool BoUpSLP::isFullyVectorizableTinyTree() {
1690 DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1691 VectorizableTree.size() << " is fully vectorizable.\n");
1692
1693 // We only handle trees of height 2.
1694 if (VectorizableTree.size() != 2)
1695 return false;
1696
1697 // Handle splat stores.
1698 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
1699 return true;
1700
1701 // Gathering cost would be too much for tiny trees.
1702 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1703 return false;
1704
1705 return true;
1706 }
1707
1708 int BoUpSLP::getSpillCost() {
1709 // Walk from the bottom of the tree to the top, tracking which values are
1710 // live. When we see a call instruction that is not part of our tree,
1711 // query TTI to see if there is a cost to keeping values live over it
1712 // (for example, if spills and fills are required).
1713 unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1714 int Cost = 0;
1715
1716 SmallPtrSet<Instruction*, 4> LiveValues;
1717 Instruction *PrevInst = nullptr;
1718
1719 for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1720 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1721 if (!Inst)
1722 continue;
1723
1724 if (!PrevInst) {
1725 PrevInst = Inst;
1726 continue;
1727 }
1728
1729 DEBUG(
1730 dbgs() << "SLP: #LV: " << LiveValues.size();
1731 for (auto *X : LiveValues)
1732 dbgs() << " " << X->getName();
1733 dbgs() << ", Looking at ";
1734 Inst->dump();
1735 );
1736
1737 // Update LiveValues.
1738 LiveValues.erase(PrevInst);
1739 for (auto &J : PrevInst->operands()) {
1740 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1741 LiveValues.insert(cast<Instruction>(&*J));
1742 }
1743
1744 // Now find the sequence of instructions between PrevInst and Inst.
1745 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1746 --PrevInstIt;
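// Scan the instructions between PrevInst and Inst; for every call crossed,
// ask TTI for the cost of keeping the currently live values across it.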
1747 while (InstIt != PrevInstIt) {
1748 if (PrevInstIt == PrevInst->getParent()->rend()) {
1749 PrevInstIt = Inst->getParent()->rbegin();
1750 continue;
1751 }
1752
1753 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1754 SmallVector<Type*, 4> V;
1755 for (auto *II : LiveValues)
1756 V.push_back(VectorType::get(II->getType(), BundleWidth));
1757 Cost += TTI->getCostOfKeepingLiveOverCall(V);
1758 }
1759
1760 ++PrevInstIt;
1761 }
1762
1763 PrevInst = Inst;
1764 }
1765
1766 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1767 return Cost;
1768 }
1769
1770 int BoUpSLP::getTreeCost() {
1771 int Cost = 0;
1772 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1773 VectorizableTree.size() << ".\n");
1774
1775 // We only vectorize tiny trees if they are fully vectorizable.
1776 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1777 if (!VectorizableTree.size()) {
1778 assert(!ExternalUses.size() && "We should not have any external users");
1779 }
1780 return INT_MAX;
1781 }
1782
1783 unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1784
1785 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1786 int C = getEntryCost(&VectorizableTree[i]);
1787 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1788 << *VectorizableTree[i].Scalars[0] << " .\n");
1789 Cost += C;
1790 }
1791
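// Add the cost of the extractelement instructions needed for scalars that
// are used outside of the vectorized tree.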
1792 SmallSet<Value *, 16> ExtractCostCalculated;
1793 int ExtractCost = 0;
1794 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1795 I != E; ++I) {
1796 // We only add extract cost once for the same scalar.
1797 if (!ExtractCostCalculated.insert(I->Scalar).second)
1798 continue;
1799
1800 // Uses by ephemeral values are free (because the ephemeral value will be
1801 // removed prior to code generation, and so the extraction will be
1802 // removed as well).
1803 if (EphValues.count(I->User))
1804 continue;
1805
1806 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1807 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1808 I->Lane);
1809 }
1810
1811 Cost += getSpillCost();
1812
1813 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
1814 return Cost + ExtractCost;
1815 }
1816
1817 int BoUpSLP::getGatherCost(Type *Ty) {
1818 int Cost = 0;
1819 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1820 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1821 return Cost;
1822 }
1823
1824 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1825 // Find the type of the operands in VL.
1826 Type *ScalarTy = VL[0]->getType();
1827 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1828 ScalarTy = SI->getValueOperand()->getType();
1829 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1830 // Find the cost of inserting/extracting values from the vector.
1831 return getGatherCost(VecTy);
1832 }
1833
1834 Value *BoUpSLP::getPointerOperand(Value *I) {
1835 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1836 return LI->getPointerOperand();
1837 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1838 return SI->getPointerOperand();
1839 return nullptr;
1840 }
1841
1842 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1843 if (LoadInst *L = dyn_cast<LoadInst>(I))
1844 return L->getPointerAddressSpace();
1845 if (StoreInst *S = dyn_cast<StoreInst>(I))
1846 return S->getPointerAddressSpace();
1847 return -1;
1848 }
1849
1850 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
1851 Value *PtrA = getPointerOperand(A);
1852 Value *PtrB = getPointerOperand(B);
1853 unsigned ASA = getAddressSpaceOperand(A);
1854 unsigned ASB = getAddressSpaceOperand(B);
1855
1856 // Check that the address spaces match and that the pointers are valid.
1857 if (!PtrA || !PtrB || (ASA != ASB))
1858 return false;
1859
1860 // Make sure that A and B are different pointers of the same type.
1861 if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1862 return false;
1863
1864 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
1865 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1866 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
1867
1868 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1869 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
1870 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
1871
1872 APInt OffsetDelta = OffsetB - OffsetA;
1873
1874 // Check if they are based on the same pointer. That makes the offsets
1875 // sufficient.
1876 if (PtrA == PtrB)
1877 return OffsetDelta == Size;
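// E.g. for two consecutive 4-byte stores to p[i] and p[i+1] that strip to
// the same base pointer, Size and OffsetDelta are both 4.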
1878
1879 // Compute the necessary base pointer delta to have the necessary final delta
1880 // equal to the size.
1881 APInt BaseDelta = Size - OffsetDelta;
1882
1883 // Otherwise compute the distance with SCEV between the base pointers.
1884 const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1885 const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1886 const SCEV *C = SE->getConstant(BaseDelta);
1887 const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1888 return X == PtrSCEVB;
1889 }
1890
1891 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
1892 Instruction *VL0 = cast<Instruction>(VL[0]);
1893 BasicBlock::iterator NextInst = VL0;
1894 ++NextInst;
1895 Builder.SetInsertPoint(VL0->getParent(), NextInst);
1896 Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
1897 }
1898
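// Gather materializes the bundle as a chain of insertelement instructions.
// Scalars that are themselves part of the vectorizable tree are recorded in
// ExternalUses so that an extractelement is generated for them later.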
1899 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
1900 Value *Vec = UndefValue::get(Ty);
1901 // Generate the 'InsertElement' instruction.
1902 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
1903 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
1904 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
1905 GatherSeq.insert(Insrt);
1906 CSEBlocks.insert(Insrt->getParent());
1907
1908 // Add to our 'need-to-extract' list.
1909 if (ScalarToTreeEntry.count(VL[i])) {
1910 int Idx = ScalarToTreeEntry[VL[i]];
1911 TreeEntry *E = &VectorizableTree[Idx];
1912 // Find which lane we need to extract.
1913 int FoundLane = -1;
1914 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
1915 // Is this the lane of the scalar that we are looking for ?
1916 if (E->Scalars[Lane] == VL[i]) {
1917 FoundLane = Lane;
1918 break;
1919 }
1920 }
1921 assert(FoundLane >= 0 && "Could not find the correct lane");
1922 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
1923 }
1924 }
1925 }
1926
1927 return Vec;
1928 }
1929
1930 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
1931 SmallDenseMap<Value*, int>::const_iterator Entry
1932 = ScalarToTreeEntry.find(VL[0]);
1933 if (Entry != ScalarToTreeEntry.end()) {
1934 int Idx = Entry->second;
1935 const TreeEntry *En = &VectorizableTree[Idx];
1936 if (En->isSame(VL) && En->VectorizedValue)
1937 return En->VectorizedValue;
1938 }
1939 return nullptr;
1940 }
1941
1942 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
1943 if (ScalarToTreeEntry.count(VL[0])) {
1944 int Idx = ScalarToTreeEntry[VL[0]];
1945 TreeEntry *E = &VectorizableTree[Idx];
1946 if (E->isSame(VL))
1947 return vectorizeTree(E);
1948 }
1949
1950 Type *ScalarTy = VL[0]->getType();
1951 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1952 ScalarTy = SI->getValueOperand()->getType();
1953 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1954
1955 return Gather(VL, VecTy);
1956 }
1957
1958 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1959 IRBuilder<>::InsertPointGuard Guard(Builder);
1960
1961 if (E->VectorizedValue) {
1962 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
1963 return E->VectorizedValue;
1964 }
1965
1966 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
1967 Type *ScalarTy = VL0->getType();
1968 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
1969 ScalarTy = SI->getValueOperand()->getType();
1970 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
1971
1972 if (E->NeedToGather) {
1973 setInsertPointAfterBundle(E->Scalars);
1974 return Gather(E->Scalars, VecTy);
1975 }
1976
1977 unsigned Opcode = getSameOpcode(E->Scalars);
1978
1979 switch (Opcode) {
1980 case Instruction::PHI: {
1981 PHINode *PH = dyn_cast<PHINode>(VL0);
1982 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
1983 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
1984 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
1985 E->VectorizedValue = NewPhi;
1986
1987 // PHINodes may have multiple entries from the same block. We want to
1988 // visit every block once.
1989 SmallSet<BasicBlock*, 4> VisitedBBs;
1990
1991 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1992 ValueList Operands;
1993 BasicBlock *IBB = PH->getIncomingBlock(i);
1994
1995 if (!VisitedBBs.insert(IBB).second) {
1996 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
1997 continue;
1998 }
1999
2000 // Prepare the operand vector.
2001 for (unsigned j = 0; j < E->Scalars.size(); ++j)
2002 Operands.push_back(cast<PHINode>(E->Scalars[j])->
2003 getIncomingValueForBlock(IBB));
2004
2005 Builder.SetInsertPoint(IBB->getTerminator());
2006 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2007 Value *Vec = vectorizeTree(Operands);
2008 NewPhi->addIncoming(Vec, IBB);
2009 }
2010
2011 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2012 "Invalid number of incoming values");
2013 return NewPhi;
2014 }
2015
2016 case Instruction::ExtractElement: {
2017 if (CanReuseExtract(E->Scalars)) {
2018 Value *V = VL0->getOperand(0);
2019 E->VectorizedValue = V;
2020 return V;
2021 }
2022 return Gather(E->Scalars, VecTy);
2023 }
2024 case Instruction::ZExt:
2025 case Instruction::SExt:
2026 case Instruction::FPToUI:
2027 case Instruction::FPToSI:
2028 case Instruction::FPExt:
2029 case Instruction::PtrToInt:
2030 case Instruction::IntToPtr:
2031 case Instruction::SIToFP:
2032 case Instruction::UIToFP:
2033 case Instruction::Trunc:
2034 case Instruction::FPTrunc:
2035 case Instruction::BitCast: {
2036 ValueList INVL;
2037 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2038 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2039
2040 setInsertPointAfterBundle(E->Scalars);
2041
2042 Value *InVec = vectorizeTree(INVL);
2043
2044 if (Value *V = alreadyVectorized(E->Scalars))
2045 return V;
2046
2047 CastInst *CI = dyn_cast<CastInst>(VL0);
2048 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2049 E->VectorizedValue = V;
2050 ++NumVectorInstructions;
2051 return V;
2052 }
2053 case Instruction::FCmp:
2054 case Instruction::ICmp: {
2055 ValueList LHSV, RHSV;
2056 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2057 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2058 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2059 }
2060
2061 setInsertPointAfterBundle(E->Scalars);
2062
2063 Value *L = vectorizeTree(LHSV);
2064 Value *R = vectorizeTree(RHSV);
2065
2066 if (Value *V = alreadyVectorized(E->Scalars))
2067 return V;
2068
2069 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
2070 Value *V;
2071 if (Opcode == Instruction::FCmp)
2072 V = Builder.CreateFCmp(P0, L, R);
2073 else
2074 V = Builder.CreateICmp(P0, L, R);
2075
2076 E->VectorizedValue = V;
2077 ++NumVectorInstructions;
2078 return V;
2079 }
2080 case Instruction::Select: {
2081 ValueList TrueVec, FalseVec, CondVec;
2082 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2083 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2084 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2085 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
2086 }
2087
2088 setInsertPointAfterBundle(E->Scalars);
2089
2090 Value *Cond = vectorizeTree(CondVec);
2091 Value *True = vectorizeTree(TrueVec);
2092 Value *False = vectorizeTree(FalseVec);
2093
2094 if (Value *V = alreadyVectorized(E->Scalars))
2095 return V;
2096
2097 Value *V = Builder.CreateSelect(Cond, True, False);
2098 E->VectorizedValue = V;
2099 ++NumVectorInstructions;
2100 return V;
2101 }
2102 case Instruction::Add:
2103 case Instruction::FAdd:
2104 case Instruction::Sub:
2105 case Instruction::FSub:
2106 case Instruction::Mul:
2107 case Instruction::FMul:
2108 case Instruction::UDiv:
2109 case Instruction::SDiv:
2110 case Instruction::FDiv:
2111 case Instruction::URem:
2112 case Instruction::SRem:
2113 case Instruction::FRem:
2114 case Instruction::Shl:
2115 case Instruction::LShr:
2116 case Instruction::AShr:
2117 case Instruction::And:
2118 case Instruction::Or:
2119 case Instruction::Xor: {
2120 ValueList LHSVL, RHSVL;
2121 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2122 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2123 else
2124 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2125 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2126 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2127 }
2128
2129 setInsertPointAfterBundle(E->Scalars);
2130
2131 Value *LHS = vectorizeTree(LHSVL);
2132 Value *RHS = vectorizeTree(RHSVL);
2133
2134 if (LHS == RHS && isa<Instruction>(LHS)) {
2135 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2136 }
2137
2138 if (Value *V = alreadyVectorized(E->Scalars))
2139 return V;
2140
2141 BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2142 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2143 E->VectorizedValue = V;
2144 propagateIRFlags(E->VectorizedValue, E->Scalars);
2145 ++NumVectorInstructions;
2146
2147 if (Instruction *I = dyn_cast<Instruction>(V))
2148 return propagateMetadata(I, E->Scalars);
2149
2150 return V;
2151 }
2152 case Instruction::Load: {
2153 // Loads are inserted at the head of the tree because we don't want to
2154 // sink them all the way down past store instructions.
2155 setInsertPointAfterBundle(E->Scalars);
2156
2157 LoadInst *LI = cast<LoadInst>(VL0);
2158 Type *ScalarLoadTy = LI->getType();
2159 unsigned AS = LI->getPointerAddressSpace();
2160
2161 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2162 VecTy->getPointerTo(AS));
2163
2164 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2165 // ExternalUses list to make sure that an extract will be generated in the
2166 // future.
2167 if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2168 ExternalUses.push_back(
2169 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2170
2171 unsigned Alignment = LI->getAlignment();
2172 LI = Builder.CreateLoad(VecPtr);
2173 if (!Alignment)
2174 Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2175 LI->setAlignment(Alignment);
2176 E->VectorizedValue = LI;
2177 ++NumVectorInstructions;
2178 return propagateMetadata(LI, E->Scalars);
2179 }
2180 case Instruction::Store: {
2181 StoreInst *SI = cast<StoreInst>(VL0);
2182 unsigned Alignment = SI->getAlignment();
2183 unsigned AS = SI->getPointerAddressSpace();
2184
2185 ValueList ValueOp;
2186 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2187 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
2188
2189 setInsertPointAfterBundle(E->Scalars);
2190
2191 Value *VecValue = vectorizeTree(ValueOp);
2192 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2193 VecTy->getPointerTo(AS));
2194 StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2195
2196 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2197 // ExternalUses list to make sure that an extract will be generated in the
2198 // future.
2199 if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2200 ExternalUses.push_back(
2201 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2202
2203 if (!Alignment)
2204 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2205 S->setAlignment(Alignment);
2206 E->VectorizedValue = S;
2207 ++NumVectorInstructions;
2208 return propagateMetadata(S, E->Scalars);
2209 }
2210 case Instruction::GetElementPtr: {
2211 setInsertPointAfterBundle(E->Scalars);
2212
2213 ValueList Op0VL;
2214 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2215 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
2216
2217 Value *Op0 = vectorizeTree(Op0VL);
2218
2219 std::vector<Value *> OpVecs;
2220 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2221 ++j) {
2222 ValueList OpVL;
2223 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2224 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
2225
2226 Value *OpVec = vectorizeTree(OpVL);
2227 OpVecs.push_back(OpVec);
2228 }
2229
2230 Value *V = Builder.CreateGEP(Op0, OpVecs);
2231 E->VectorizedValue = V;
2232 ++NumVectorInstructions;
2233
2234 if (Instruction *I = dyn_cast<Instruction>(V))
2235 return propagateMetadata(I, E->Scalars);
2236
2237 return V;
2238 }
2239 case Instruction::Call: {
2240 CallInst *CI = cast<CallInst>(VL0);
2241 setInsertPointAfterBundle(E->Scalars);
2242 Function *FI;
2243 Intrinsic::ID IID = Intrinsic::not_intrinsic;
2244 Value *ScalarArg = nullptr;
2245 if (CI && (FI = CI->getCalledFunction())) {
2246 IID = (Intrinsic::ID) FI->getIntrinsicID();
2247 }
2248 std::vector<Value *> OpVecs;
2249 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2250 ValueList OpVL;
2251 // ctlz,cttz and powi are special intrinsics whose second argument is
2252 // a scalar. This argument should not be vectorized.
2253 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2254 CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2255 ScalarArg = CEI->getArgOperand(j);
2256 OpVecs.push_back(CEI->getArgOperand(j));
2257 continue;
2258 }
2259 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2260 CallInst *CEI = cast<CallInst>(E->Scalars[i]);
2261 OpVL.push_back(CEI->getArgOperand(j));
2262 }
2263
2264 Value *OpVec = vectorizeTree(OpVL);
2265 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2266 OpVecs.push_back(OpVec);
2267 }
2268
2269 Module *M = F->getParent();
2270 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2271 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2272 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2273 Value *V = Builder.CreateCall(CF, OpVecs);
2274
2275 // The scalar argument uses an in-tree scalar so we add the new vectorized
2276 // call to ExternalUses list to make sure that an extract will be
2277 // generated in the future.
2278 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2279 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2280
2281 E->VectorizedValue = V;
2282 ++NumVectorInstructions;
2283 return V;
2284 }
2285 case Instruction::ShuffleVector: {
2286 ValueList LHSVL, RHSVL;
2287 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2288 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2289 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2290 }
2291 setInsertPointAfterBundle(E->Scalars);
2292
2293 Value *LHS = vectorizeTree(LHSVL);
2294 Value *RHS = vectorizeTree(RHSVL);
2295
2296 if (Value *V = alreadyVectorized(E->Scalars))
2297 return V;
2298
2299 // Create a vector of LHS op1 RHS
2300 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2301 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2302
2303 // Create a vector of LHS op2 RHS
2304 Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2305 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2306 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2307
2308 // Create shuffle to take alternate operations from the vector.
2309 // Also, gather up odd and even scalar ops to propagate IR flags to
2310 // each vector operation.
2311 ValueList OddScalars, EvenScalars;
2312 unsigned e = E->Scalars.size();
2313 SmallVector<Constant *, 8> Mask(e);
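// E.g. for a bundle of four scalars the mask is <0, 5, 2, 7>: even lanes are
// taken from V0 and odd lanes from V1.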
2314 for (unsigned i = 0; i < e; ++i) {
2315 if (i & 1) {
2316 Mask[i] = Builder.getInt32(e + i);
2317 OddScalars.push_back(E->Scalars[i]);
2318 } else {
2319 Mask[i] = Builder.getInt32(i);
2320 EvenScalars.push_back(E->Scalars[i]);
2321 }
2322 }
2323
2324 Value *ShuffleMask = ConstantVector::get(Mask);
2325 propagateIRFlags(V0, EvenScalars);
2326 propagateIRFlags(V1, OddScalars);
2327
2328 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2329 E->VectorizedValue = V;
2330 ++NumVectorInstructions;
2331 if (Instruction *I = dyn_cast<Instruction>(V))
2332 return propagateMetadata(I, E->Scalars);
2333
2334 return V;
2335 }
2336 default:
2337 llvm_unreachable("unknown inst");
2338 }
2339 return nullptr;
2340 }
2341
2342 Value *BoUpSLP::vectorizeTree() {
2343
2344 // All blocks must be scheduled before any instructions are inserted.
2345 for (auto &BSIter : BlocksSchedules) {
2346 scheduleBlock(BSIter.second.get());
2347 }
2348
2349 Builder.SetInsertPoint(F->getEntryBlock().begin());
2350 vectorizeTree(&VectorizableTree[0]);
2351
2352 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values.\n");
2353
2354 // Extract all of the elements with the external uses.
2355 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2356 it != e; ++it) {
2357 Value *Scalar = it->Scalar;
2358 llvm::User *User = it->User;
2359
2360 // Skip users that we already RAUWed. This happens when one instruction
2361 // has multiple uses of the same value.
2362 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2363 Scalar->user_end())
2364 continue;
2365 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2366
2367 int Idx = ScalarToTreeEntry[Scalar];
2368 TreeEntry *E = &VectorizableTree[Idx];
2369 assert(!E->NeedToGather && "Extracting from a gather list");
2370
2371 Value *Vec = E->VectorizedValue;
2372 assert(Vec && "Can't find vectorizable value");
2373
2374 Value *Lane = Builder.getInt32(it->Lane);
2375 // Generate extracts for out-of-tree users.
2376 // Find the insertion point for the extractelement lane.
2377 if (isa<Instruction>(Vec)) {
2378 if (PHINode *PH = dyn_cast<PHINode>(User)) {
2379 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2380 if (PH->getIncomingValue(i) == Scalar) {
2381 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2382 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2383 CSEBlocks.insert(PH->getIncomingBlock(i));
2384 PH->setOperand(i, Ex);
2385 }
2386 }
2387 } else {
2388 Builder.SetInsertPoint(cast<Instruction>(User));
2389 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2390 CSEBlocks.insert(cast<Instruction>(User)->getParent());
2391 User->replaceUsesOfWith(Scalar, Ex);
2392 }
2393 } else {
2394 Builder.SetInsertPoint(F->getEntryBlock().begin());
2395 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2396 CSEBlocks.insert(&F->getEntryBlock());
2397 User->replaceUsesOfWith(Scalar, Ex);
2398 }
2399
2400 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2401 }
2402
2403 // For each vectorized value:
2404 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2405 TreeEntry *Entry = &VectorizableTree[EIdx];
2406
2407 // For each lane:
2408 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2409 Value *Scalar = Entry->Scalars[Lane];
2410 // No need to handle users of gathered values.
2411 if (Entry->NeedToGather)
2412 continue;
2413
2414 assert(Entry->VectorizedValue && "Can't find vectorizable value");
2415
2416 Type *Ty = Scalar->getType();
2417 if (!Ty->isVoidTy()) {
2418 #ifndef NDEBUG
2419 for (User *U : Scalar->users()) {
2420 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2421
2422 assert((ScalarToTreeEntry.count(U) ||
2423 // It is legal to replace users in the ignorelist by undef.
2424 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2425 UserIgnoreList.end())) &&
2426 "Replacing out-of-tree value with undef");
2427 }
2428 #endif
2429 Value *Undef = UndefValue::get(Ty);
2430 Scalar->replaceAllUsesWith(Undef);
2431 }
2432 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2433 eraseInstruction(cast<Instruction>(Scalar));
2434 }
2435 }
2436
2437 Builder.ClearInsertionPoint();
2438
2439 return VectorizableTree[0].VectorizedValue;
2440 }
2441
2442 void BoUpSLP::optimizeGatherSequence() {
2443 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2444 << " gather sequence instructions.\n");
2445 // LICM InsertElementInst sequences.
2446 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2447 e = GatherSeq.end(); it != e; ++it) {
2448 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2449
2450 if (!Insert)
2451 continue;
2452
2453 // Check if this block is inside a loop.
2454 Loop *L = LI->getLoopFor(Insert->getParent());
2455 if (!L)
2456 continue;
2457
2458 // Check if it has a preheader.
2459 BasicBlock *PreHeader = L->getLoopPreheader();
2460 if (!PreHeader)
2461 continue;
2462
2463 // If the vector or the element that we insert into it are
2464 // instructions that are defined in this basic block then we can't
2465 // hoist this instruction.
2466 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2467 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2468 if (CurrVec && L->contains(CurrVec))
2469 continue;
2470 if (NewElem && L->contains(NewElem))
2471 continue;
2472
2473 // We can hoist this instruction. Move it to the pre-header.
2474 Insert->moveBefore(PreHeader->getTerminator());
2475 }
2476
2477 // Make a list of all reachable blocks in our CSE queue.
2478 SmallVector<const DomTreeNode *, 8> CSEWorkList;
2479 CSEWorkList.reserve(CSEBlocks.size());
2480 for (BasicBlock *BB : CSEBlocks)
2481 if (DomTreeNode *N = DT->getNode(BB)) {
2482 assert(DT->isReachableFromEntry(N));
2483 CSEWorkList.push_back(N);
2484 }
2485
2486 // Sort blocks by domination. This ensures we visit a block after all blocks
2487 // dominating it are visited.
2488 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2489 [this](const DomTreeNode *A, const DomTreeNode *B) {
2490 return DT->properlyDominates(A, B);
2491 });
2492
2493 // Perform O(N^2) search over the gather sequences and merge identical
2494 // instructions. TODO: We can further optimize this scan if we split the
2495 // instructions into different buckets based on the insert lane.
2496 SmallVector<Instruction *, 16> Visited;
2497 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2498 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2499 "Worklist not sorted properly!");
2500 BasicBlock *BB = (*I)->getBlock();
2501 // For all instructions in blocks containing gather sequences:
2502 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2503 Instruction *In = it++;
2504 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2505 continue;
2506
2507 // Check if we can replace this instruction with any of the
2508 // visited instructions.
2509 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2510 ve = Visited.end();
2511 v != ve; ++v) {
2512 if (In->isIdenticalTo(*v) &&
2513 DT->dominates((*v)->getParent(), In->getParent())) {
2514 In->replaceAllUsesWith(*v);
2515 eraseInstruction(In);
2516 In = nullptr;
2517 break;
2518 }
2519 }
2520 if (In) {
2521 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2522 Visited.push_back(In);
2523 }
2524 }
2525 }
2526 CSEBlocks.clear();
2527 GatherSeq.clear();
2528 }
2529
2530 // Groups the instructions into a bundle (which is then a single scheduling entity)
2531 // and schedules instructions until the bundle gets ready.
2532 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2533 BoUpSLP *SLP) {
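// PHI nodes are always grouped at the top of their block and never need to
// be scheduled.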
2534 if (isa<PHINode>(VL[0]))
2535 return true;
2536
2537 // Initialize the instruction bundle.
2538 Instruction *OldScheduleEnd = ScheduleEnd;
2539 ScheduleData *PrevInBundle = nullptr;
2540 ScheduleData *Bundle = nullptr;
2541 bool ReSchedule = false;
2542 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
2543 for (Value *V : VL) {
2544 extendSchedulingRegion(V);
2545 ScheduleData *BundleMember = getScheduleData(V);
2546 assert(BundleMember &&
2547 "no ScheduleData for bundle member (maybe not in same basic block)");
2548 if (BundleMember->IsScheduled) {
2549 // A bundle member was scheduled as single instruction before and now
2550 // needs to be scheduled as part of the bundle. We just get rid of the
2551 // existing schedule.
2552 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
2553 << " was already scheduled\n");
2554 ReSchedule = true;
2555 }
2556 assert(BundleMember->isSchedulingEntity() &&
2557 "bundle member already part of other bundle");
2558 if (PrevInBundle) {
2559 PrevInBundle->NextInBundle = BundleMember;
2560 } else {
2561 Bundle = BundleMember;
2562 }
2563 BundleMember->UnscheduledDepsInBundle = 0;
2564 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2565
2566 // Group the instructions to a bundle.
2567 BundleMember->FirstInBundle = Bundle;
2568 PrevInBundle = BundleMember;
2569 }
2570 if (ScheduleEnd != OldScheduleEnd) {
2571 // The scheduling region got new instructions at the lower end (or it is a
2572 // new region for the first bundle). This makes it necessary to
2573 // recalculate all dependencies.
2574 // This seldom needs to be done a second time after adding the
2575 // initial bundle to the region.
2576 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2577 ScheduleData *SD = getScheduleData(I);
2578 SD->clearDependencies();
2579 }
2580 ReSchedule = true;
2581 }
2582 if (ReSchedule) {
2583 resetSchedule();
2584 initialFillReadyList(ReadyInsts);
2585 }
2586
2587 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2588 << BB->getName() << "\n");
2589
2590 calculateDependencies(Bundle, true, SLP);
2591
2592 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2593 // means that there are no cyclic dependencies and we can schedule it.
2594 // Note that it's important that we don't "schedule" the bundle yet (see
2595 // cancelScheduling).
2596 while (!Bundle->isReady() && !ReadyInsts.empty()) {
2597
2598 ScheduleData *pickedSD = ReadyInsts.back();
2599 ReadyInsts.pop_back();
2600
2601 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2602 schedule(pickedSD, ReadyInsts);
2603 }
2604 }
2605 return Bundle->isReady();
2606 }
2607
2608 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2609 if (isa<PHINode>(VL[0]))
2610 return;
2611
2612 ScheduleData *Bundle = getScheduleData(VL[0]);
2613 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
2614 assert(!Bundle->IsScheduled &&
2615 "Can't cancel bundle which is already scheduled");
2616 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2617 "tried to unbundle something which is not a bundle");
2618
2619 // Un-bundle: make single instructions out of the bundle.
2620 ScheduleData *BundleMember = Bundle;
2621 while (BundleMember) {
2622 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2623 BundleMember->FirstInBundle = BundleMember;
2624 ScheduleData *Next = BundleMember->NextInBundle;
2625 BundleMember->NextInBundle = nullptr;
2626 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2627 if (BundleMember->UnscheduledDepsInBundle == 0) {
2628 ReadyInsts.insert(BundleMember);
2629 }
2630 BundleMember = Next;
2631 }
2632 }
2633
2634 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2635 if (getScheduleData(V))
2636 return;
2637 Instruction *I = dyn_cast<Instruction>(V);
2638 assert(I && "bundle member must be an instruction");
2639 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2640 if (!ScheduleStart) {
2641 // It's the first instruction in the new region.
2642 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2643 ScheduleStart = I;
2644 ScheduleEnd = I->getNextNode();
2645 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2646 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
2647 return;
2648 }
2649 // Search up and down at the same time, because we don't know if the new
2650 // instruction is above or below the existing scheduling region.
2651 BasicBlock::reverse_iterator UpIter(ScheduleStart);
2652 BasicBlock::reverse_iterator UpperEnd = BB->rend();
2653 BasicBlock::iterator DownIter(ScheduleEnd);
2654 BasicBlock::iterator LowerEnd = BB->end();
2655 for (;;) {
2656 if (UpIter != UpperEnd) {
2657 if (&*UpIter == I) {
2658 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2659 ScheduleStart = I;
2660 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
2661 return;
2662 }
2663 UpIter++;
2664 }
2665 if (DownIter != LowerEnd) {
2666 if (&*DownIter == I) {
2667 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2668 nullptr);
2669 ScheduleEnd = I->getNextNode();
2670 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2671 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
2672 return;
2673 }
2674 DownIter++;
2675 }
2676 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2677 "instruction not found in block");
2678 }
2679 }
2680
2681 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2682 Instruction *ToI,
2683 ScheduleData *PrevLoadStore,
2684 ScheduleData *NextLoadStore) {
2685 ScheduleData *CurrentLoadStore = PrevLoadStore;
2686 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2687 ScheduleData *SD = ScheduleDataMap[I];
2688 if (!SD) {
2689 // Allocate a new ScheduleData for the instruction.
2690 if (ChunkPos >= ChunkSize) {
2691 ScheduleDataChunks.push_back(
2692 llvm::make_unique<ScheduleData[]>(ChunkSize));
2693 ChunkPos = 0;
2694 }
2695 SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2696 ScheduleDataMap[I] = SD;
2697 SD->Inst = I;
2698 }
2699 assert(!isInSchedulingRegion(SD) &&
2700 "new ScheduleData already in scheduling region");
2701 SD->init(SchedulingRegionID);
2702
2703 if (I->mayReadOrWriteMemory()) {
2704 // Update the linked list of memory accessing instructions.
2705 if (CurrentLoadStore) {
2706 CurrentLoadStore->NextLoadStore = SD;
2707 } else {
2708 FirstLoadStoreInRegion = SD;
2709 }
2710 CurrentLoadStore = SD;
2711 }
2712 }
2713 if (NextLoadStore) {
2714 if (CurrentLoadStore)
2715 CurrentLoadStore->NextLoadStore = NextLoadStore;
2716 } else {
2717 LastLoadStoreInRegion = CurrentLoadStore;
2718 }
2719 }
2720
2721 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2722 bool InsertInReadyList,
2723 BoUpSLP *SLP) {
2724 assert(SD->isSchedulingEntity());
2725
2726 SmallVector<ScheduleData *, 10> WorkList;
2727 WorkList.push_back(SD);
2728
2729 while (!WorkList.empty()) {
2730 ScheduleData *SD = WorkList.back();
2731 WorkList.pop_back();
2732
2733 ScheduleData *BundleMember = SD;
2734 while (BundleMember) {
2735 assert(isInSchedulingRegion(BundleMember));
2736 if (!BundleMember->hasValidDependencies()) {
2737
2738 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
2739 BundleMember->Dependencies = 0;
2740 BundleMember->resetUnscheduledDeps();
2741
2742 // Handle def-use chain dependencies.
2743 for (User *U : BundleMember->Inst->users()) {
2744 if (isa<Instruction>(U)) {
2745 ScheduleData *UseSD = getScheduleData(U);
2746 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2747 BundleMember->Dependencies++;
2748 ScheduleData *DestBundle = UseSD->FirstInBundle;
2749 if (!DestBundle->IsScheduled) {
2750 BundleMember->incrementUnscheduledDeps(1);
2751 }
2752 if (!DestBundle->hasValidDependencies()) {
2753 WorkList.push_back(DestBundle);
2754 }
2755 }
2756 } else {
2757 // I'm not sure if this can ever happen. But we need to be safe.
2758 // This keeps the instruction/bundle from ever becoming ready, which
2759 // eventually disables vectorization.
2760 BundleMember->Dependencies++;
2761 BundleMember->incrementUnscheduledDeps(1);
2762 }
2763 }
2764
2765 // Handle the memory dependencies.
2766 ScheduleData *DepDest = BundleMember->NextLoadStore;
2767 if (DepDest) {
2768 Instruction *SrcInst = BundleMember->Inst;
2769 AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
2770 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2771
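// Walk the memory instructions that follow in the scheduling region; any
// pair where at least one side may write and the locations may alias adds a
// memory dependency.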
2772 while (DepDest) {
2773 assert(isInSchedulingRegion(DepDest));
2774 if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
2775 if (SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)) {
2776 DepDest->MemoryDependencies.push_back(BundleMember);
2777 BundleMember->Dependencies++;
2778 ScheduleData *DestBundle = DepDest->FirstInBundle;
2779 if (!DestBundle->IsScheduled) {
2780 BundleMember->incrementUnscheduledDeps(1);
2781 }
2782 if (!DestBundle->hasValidDependencies()) {
2783 WorkList.push_back(DestBundle);
2784 }
2785 }
2786 }
2787 DepDest = DepDest->NextLoadStore;
2788 }
2789 }
2790 }
2791 BundleMember = BundleMember->NextInBundle;
2792 }
2793 if (InsertInReadyList && SD->isReady()) {
2794 ReadyInsts.push_back(SD);
2795 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
2796 }
2797 }
2798 }
2799
2800 void BoUpSLP::BlockScheduling::resetSchedule() {
2801 assert(ScheduleStart &&
2802 "tried to reset schedule on block which has not been scheduled");
2803 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2804 ScheduleData *SD = getScheduleData(I);
2805 assert(isInSchedulingRegion(SD));
2806 SD->IsScheduled = false;
2807 SD->resetUnscheduledDeps();
2808 }
2809 ReadyInsts.clear();
2810 }
2811
2812 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2813
2814 if (!BS->ScheduleStart)
2815 return;
2816
2817 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2818
2819 BS->resetSchedule();
2820
2821 // For the real scheduling we use a more sophisticated ready-list: it is
2822 // sorted by the original instruction location. This lets the final schedule
2823 // be as close as possible to the original instruction order.
2824 struct ScheduleDataCompare {
2825 bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2826 return SD2->SchedulingPriority < SD1->SchedulingPriority;
2827 }
2828 };
2829 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2830
2831 // Ensure that all dependency data is updated and fill the ready-list with
2832 // initial instructions.
2833 int Idx = 0;
2834 int NumToSchedule = 0;
2835 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
2836 I = I->getNextNode()) {
2837 ScheduleData *SD = BS->getScheduleData(I);
2838 assert(
2839 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
2840 "scheduler and vectorizer have different opinion on what is a bundle");
2841 SD->FirstInBundle->SchedulingPriority = Idx++;
2842 if (SD->isSchedulingEntity()) {
2843 BS->calculateDependencies(SD, false, this);
2844 NumToSchedule++;
2845 }
2846 }
2847 BS->initialFillReadyList(ReadyInsts);
2848
2849 Instruction *LastScheduledInst = BS->ScheduleEnd;
2850
2851 // Do the "real" scheduling.
2852 while (!ReadyInsts.empty()) {
2853 ScheduleData *picked = *ReadyInsts.begin();
2854 ReadyInsts.erase(ReadyInsts.begin());
2855
2856 // Move the scheduled instruction(s) to their dedicated places, if not
2857 // there yet.
2858 ScheduleData *BundleMember = picked;
2859 while (BundleMember) {
2860 Instruction *pickedInst = BundleMember->Inst;
2861 if (LastScheduledInst->getNextNode() != pickedInst) {
2862 BS->BB->getInstList().remove(pickedInst);
2863 BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
2864 }
2865 LastScheduledInst = pickedInst;
2866 BundleMember = BundleMember->NextInBundle;
2867 }
2868
2869 BS->schedule(picked, ReadyInsts);
2870 NumToSchedule--;
2871 }
2872 assert(NumToSchedule == 0 && "could not schedule all instructions");
2873
2874 // Avoid duplicate scheduling of the block.
2875 BS->ScheduleStart = nullptr;
2876 }
2877
2878 /// The SLPVectorizer Pass.
2879 struct SLPVectorizer : public FunctionPass {
2880 typedef SmallVector<StoreInst *, 8> StoreList;
2881 typedef MapVector<Value *, StoreList> StoreListMap;
2882
2883 /// Pass identification, replacement for typeid
2884 static char ID;
2885
2886 explicit SLPVectorizer() : FunctionPass(ID) {
2887 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
2888 }
2889
2890 ScalarEvolution *SE;
2891 const DataLayout *DL;
2892 TargetTransformInfo *TTI;
2893 TargetLibraryInfo *TLI;
2894 AliasAnalysis *AA;
2895 LoopInfo *LI;
2896 DominatorTree *DT;
2897 AssumptionCache *AC;
2898
2899 bool runOnFunction(Function &F) override {
2900 if (skipOptnoneFunction(F))
2901 return false;
2902
2903 SE = &getAnalysis<ScalarEvolution>();
2904 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
2905 DL = DLP ? &DLP->getDataLayout() : nullptr;
2906 TTI = &getAnalysis<TargetTransformInfo>();
2907 TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
2908 AA = &getAnalysis<AliasAnalysis>();
2909 LI = &getAnalysis<LoopInfo>();
2910 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2911 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2912
2913 StoreRefs.clear();
2914 bool Changed = false;
2915
2916 // If the target claims to have no vector registers don't attempt
2917 // vectorization.
2918 if (!TTI->getNumberOfRegisters(true))
2919 return false;
2920
2921 // Must have DataLayout. We can't require it because some tests run
2922 // without a triple.
2923 if (!DL)
2924 return false;
2925
2926 // Don't vectorize when the attribute NoImplicitFloat is used.
2927 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
2928 return false;
2929
2930 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
2931
2932 // Use the bottom up slp vectorizer to construct chains that start with
2933 // store instructions.
2934 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
2935
2936 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
2937 // delete instructions.
2938
2939 // Scan the blocks in the function in post order.
2940 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
2941 e = po_end(&F.getEntryBlock()); it != e; ++it) {
2942 BasicBlock *BB = *it;
2943 // Vectorize trees that end at stores.
2944 if (unsigned count = collectStores(BB, R)) {
2945 (void)count;
2946 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
2947 Changed |= vectorizeStoreChains(R);
2948 }
2949
2950 // Vectorize trees that end at reductions.
2951 Changed |= vectorizeChainsInBlock(BB, R);
2952 }
2953
2954 if (Changed) {
2955 R.optimizeGatherSequence();
2956 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
2957 DEBUG(verifyFunction(F));
2958 }
2959 return Changed;
2960 }
2961
2962 void getAnalysisUsage(AnalysisUsage &AU) const override {
2963 FunctionPass::getAnalysisUsage(AU);
2964 AU.addRequired<AssumptionCacheTracker>();
2965 AU.addRequired<ScalarEvolution>();
2966 AU.addRequired<AliasAnalysis>();
2967 AU.addRequired<TargetTransformInfo>();
2968 AU.addRequired<LoopInfo>();
2969 AU.addRequired<DominatorTreeWrapperPass>();
2970 AU.addPreserved<LoopInfo>();
2971 AU.addPreserved<DominatorTreeWrapperPass>();
2972 AU.setPreservesCFG();
2973 }
2974
2975 private:
2976
2977 /// \brief Collect memory references and sort them according to their base
2978 /// object. We sort the stores to their base objects to reduce the cost of the
2979 /// quadratic search on the stores. TODO: We can further reduce this cost
2980 /// if we flush the chain creation every time we run into a memory barrier.
2981 unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
2982
2983 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
2984 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
2985
2986 /// \brief Try to vectorize a list of operands.
2987 /// \param BuildVector A list of users to ignore for the purpose of
2988 /// scheduling and that don't need extracting.
2989 /// \returns true if a value was vectorized.
2990 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
2991 ArrayRef<Value *> BuildVector = None,
2992 bool allowReorder = false);
2993
2994 /// \brief Try to vectorize a chain that may start at the operands of \p V.
2995 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
2996
2997 /// \brief Vectorize the stores that were collected in StoreRefs.
2998 bool vectorizeStoreChains(BoUpSLP &R);
2999
3000 /// \brief Scan the basic block and look for patterns that are likely to start
3001 /// a vectorization chain.
3002 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3003
3004 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3005 BoUpSLP &R);
3006
3007 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3008 BoUpSLP &R);
3009 private:
3010 StoreListMap StoreRefs;
3011 };
3012
3013 /// \brief Check that the Values in the slice of the VL array still exist in
3014 /// the WeakVH array.
3015 /// Vectorization of part of the VL array may cause later values in the VL array
3016 /// to become invalid. We track when this has happened in the WeakVH array.
3017 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
3018 SmallVectorImpl<WeakVH> &VH,
3019 unsigned SliceBegin,
3020 unsigned SliceSize) {
3021 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
3022 if (VH[i] != VL[i])
3023 return true;
3024
3025 return false;
3026 }
3027
3028 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3029 int CostThreshold, BoUpSLP &R) {
3030 unsigned ChainLen = Chain.size();
3031 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3032 << "\n");
3033 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
3034 unsigned Sz = DL->getTypeSizeInBits(StoreTy);
3035 unsigned VF = MinVecRegSize / Sz;
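// VF is the number of elements of the stored type that fit into a vector
// register of MinVecRegSize bits.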
3036
3037 if (!isPowerOf2_32(Sz) || VF < 2)
3038 return false;
3039
3040 // Keep track of values that were deleted by vectorizing in the loop below.
3041 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3042
3043 bool Changed = false;
3044 // Look for profitable vectorizable trees at all offsets, starting at zero.
3045 for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3046 if (i + VF > e)
3047 break;
3048
3049 // Check that a previous iteration of this loop did not delete the Value.
3050 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3051 continue;
3052
3053 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3054 << "\n");
3055 ArrayRef<Value *> Operands = Chain.slice(i, VF);
3056
3057 R.buildTree(Operands);
3058
3059 int Cost = R.getTreeCost();
3060
3061 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3062 if (Cost < CostThreshold) {
3063 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3064 R.vectorizeTree();
3065
3066 // Move to the next bundle.
3067 i += VF - 1;
3068 Changed = true;
3069 }
3070 }
3071
3072 return Changed;
3073 }
3074
3075 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3076 int costThreshold, BoUpSLP &R) {
3077 SetVector<Value *> Heads, Tails;
3078 SmallDenseMap<Value *, Value *> ConsecutiveChain;
3079
3080 // We may run into multiple chains that merge into a single chain. We mark the
3081 // stores that we vectorized so that we don't visit the same store twice.
3082 BoUpSLP::ValueSet VectorizedStores;
3083 bool Changed = false;
3084
3085 // Do a quadratic search on all of the given stores and find
3086 // all of the pairs of stores that follow each other.
3087 for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3088 for (unsigned j = 0; j < e; ++j) {
3089 if (i == j)
3090 continue;
3091
3092 if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
3093 Tails.insert(Stores[j]);
3094 Heads.insert(Stores[i]);
3095 ConsecutiveChain[Stores[i]] = Stores[j];
3096 }
3097 }
3098 }
3099
3100 // For stores that start but don't end a link in the chain:
3101 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
3102 it != e; ++it) {
3103 if (Tails.count(*it))
3104 continue;
3105
3106 // We found a store instr that starts a chain. Now follow the chain and try
3107 // to vectorize it.
3108 BoUpSLP::ValueList Operands;
3109 Value *I = *it;
3110 // Collect the chain into a list.
3111 while (Tails.count(I) || Heads.count(I)) {
3112 if (VectorizedStores.count(I))
3113 break;
3114 Operands.push_back(I);
3115 // Move to the next value in the chain.
3116 I = ConsecutiveChain[I];
3117 }
3118
3119 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
3120
3121 // Mark the vectorized stores so that we don't vectorize them again.
3122 if (Vectorized)
3123 VectorizedStores.insert(Operands.begin(), Operands.end());
3124 Changed |= Vectorized;
3125 }
3126
3127 return Changed;
3128 }
3129
3130
3131 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3132 unsigned count = 0;
3133 StoreRefs.clear();
3134 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3135 StoreInst *SI = dyn_cast<StoreInst>(it);
3136 if (!SI)
3137 continue;
3138
3139 // Don't touch volatile stores.
3140 if (!SI->isSimple())
3141 continue;
3142
3143 // Check that the pointer points to scalars.
3144 Type *Ty = SI->getValueOperand()->getType();
3145 if (!isValidElementType(Ty))
3146 continue;
3147
3148 // Find the base pointer.
3149 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3150
3151 // Save the store locations.
3152 StoreRefs[Ptr].push_back(SI);
3153 count++;
3154 }
3155 return count;
3156 }
3157
3158 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3159 if (!A || !B)
3160 return false;
3161 Value *VL[] = { A, B };
3162 return tryToVectorizeList(VL, R, None, true);
3163 }
3164
3165 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3166 ArrayRef<Value *> BuildVector,
3167 bool allowReorder) {
3168 if (VL.size() < 2)
3169 return false;
3170
3171 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3172
3173 // Check that all of the parts are scalar instructions of the same type.
3174 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3175 if (!I0)
3176 return false;
3177
3178 unsigned Opcode0 = I0->getOpcode();
3179
3180 Type *Ty0 = I0->getType();
3181 unsigned Sz = DL->getTypeSizeInBits(Ty0);
3182 unsigned VF = MinVecRegSize / Sz;
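// Illustrative: with MinVecRegSize = 128 and 32-bit elements, Sz = 32 and
// VF = 4, so we attempt to vectorize groups of four scalars at a time.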
3183
3184 for (int i = 0, e = VL.size(); i < e; ++i) {
3185 Type *Ty = VL[i]->getType();
3186 if (!isValidElementType(Ty))
3187 return false;
3188 Instruction *Inst = dyn_cast<Instruction>(VL[i]);
3189 if (!Inst || Inst->getOpcode() != Opcode0)
3190 return false;
3191 }
3192
3193 bool Changed = false;
3194
3195 // Keep track of values that were deleted by vectorizing in the loop below.
3196 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3197
3198 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3199 unsigned OpsWidth = 0;
3200
3201 if (i + VF > e)
3202 OpsWidth = e - i;
3203 else
3204 OpsWidth = VF;
3205
3206 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3207 break;
3208
3209 // Check that a previous iteration of this loop did not delete the Value.
3210 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3211 continue;
3212
3213 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3214 << "\n");
3215 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3216
3217 ArrayRef<Value *> BuildVectorSlice;
3218 if (!BuildVector.empty())
3219 BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3220
3221 R.buildTree(Ops, BuildVectorSlice);
3222 // TODO: check if we can allow reordering for cases other than
3223 // tryToVectorizePair().
3224 if (allowReorder && R.shouldReorder()) {
3225 assert(Ops.size() == 2);
3226 assert(BuildVectorSlice.empty());
3227 Value *ReorderedOps[] = { Ops[1], Ops[0] };
3228 R.buildTree(ReorderedOps, None);
3229 }
3230 int Cost = R.getTreeCost();
3231
3232 if (Cost < -SLPCostThreshold) {
3233 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3234 Value *VectorizedRoot = R.vectorizeTree();
3235
3236 // Reconstruct the build vector by extracting the vectorized root. This
3237 // way we handle the case where some elements of the vector are undefined.
3238 // (return (insertelement <4 x i32> (insertelement undef (opd0) 0) (opd1) 2))
3239 if (!BuildVectorSlice.empty()) {
3240 // The insert point is the last build vector instruction. The vectorized
3241 // root will precede it. This guarantees that we get an instruction. The
3242 // vectorized tree could have been constant folded.
3243 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3244 unsigned VecIdx = 0;
3245 for (auto &V : BuildVectorSlice) {
3246 IRBuilder<true, NoFolder> Builder(
3247 ++BasicBlock::iterator(InsertAfter));
3248 InsertElementInst *IE = cast<InsertElementInst>(V);
3249 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3250 VectorizedRoot, Builder.getInt32(VecIdx++)));
3251 IE->setOperand(1, Extract);
3252 IE->removeFromParent();
3253 IE->insertAfter(Extract);
3254 InsertAfter = IE;
3255 }
3256 }
3257 // Move to the next bundle.
3258 i += VF - 1;
3259 Changed = true;
3260 }
3261 }
3262
3263 return Changed;
3264 }
3265
3266 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3267 if (!V)
3268 return false;
3269
3270 // Try to vectorize V.
3271 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3272 return true;
3273
3274 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3275 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3276 // Try to skip B.
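// E.g. (illustrative), for V = A op (B0 op B1), try the pairs (A, B0) and
// (A, B1) in the hope that one of them forms a vectorizable pair.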
3277 if (B && B->hasOneUse()) {
3278 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3279 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3280 if (tryToVectorizePair(A, B0, R)) {
3281 return true;
3282 }
3283 if (tryToVectorizePair(A, B1, R)) {
3284 return true;
3285 }
3286 }
3287
3288 // Try to skip A.
3289 if (A && A->hasOneUse()) {
3290 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3291 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3292 if (tryToVectorizePair(A0, B, R)) {
3293 return true;
3294 }
3295 if (tryToVectorizePair(A1, B, R)) {
3296 return true;
3297 }
3298 }
3299 return false;
3300 }
3301
3302 /// \brief Generate a shuffle mask to be used in a reduction tree.
3303 ///
3304 /// \param VecLen The length of the vector to be reduced.
3305 /// \param NumEltsToRdx The number of elements that should be reduced in the
3306 /// vector.
3307 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3308 /// reduction. A pairwise reduction will generate a mask of
3309 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3310 /// <2,3,undef,undef> for a vector of 4 and NumEltsToRdx = 2.
3311 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
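///
/// For example (illustrative), with VecLen = 4 and NumEltsToRdx = 2 this
/// produces <0, 2, undef, undef> (pairwise, left), <1, 3, undef, undef>
/// (pairwise, right) or <2, 3, undef, undef> (splitting).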
3312 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3313 bool IsPairwise, bool IsLeft,
3314 IRBuilder<> &Builder) {
3315 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3316
3317 SmallVector<Constant *, 32> ShuffleMask(
3318 VecLen, UndefValue::get(Builder.getInt32Ty()));
3319
3320 if (IsPairwise)
3321 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3322 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3323 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3324 else
3325 // Move the upper half of the vector to the lower half.
3326 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3327 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3328
3329 return ConstantVector::get(ShuffleMask);
3330 }
3331
3332
3333 /// Model horizontal reductions.
3334 ///
3335 /// A horizontal reduction is a tree of reduction operations (currently add and
3336 /// fadd) whose leaves are operations that can be put into a vector.
3337 /// For example, this tree:
3338 ///
3339 /// mul mul mul mul
3340 /// \ / \ /
3341 /// + +
3342 /// \ /
3343 /// +
3344 /// This tree has "mul" as its reduced values and "+" as its reduction
3345 /// operations. A reduction might be feeding into a store or a binary operation
3346 /// feeding a phi.
3347 /// ...
3348 /// \ /
3349 /// +
3350 /// |
3351 /// phi +=
3352 ///
3353 /// Or:
3354 /// ...
3355 /// \ /
3356 /// +
3357 /// |
3358 /// *p =
3359 ///
3360 class HorizontalReduction {
3361 SmallVector<Value *, 16> ReductionOps;
3362 SmallVector<Value *, 32> ReducedVals;
3363
3364 BinaryOperator *ReductionRoot;
3365 PHINode *ReductionPHI;
3366
3367 /// The opcode of the reduction.
3368 unsigned ReductionOpcode;
3369 /// The opcode of the values we perform a reduction on.
3370 unsigned ReducedValueOpcode;
3371 /// The width of one full horizontal reduction operation.
3372 unsigned ReduxWidth;
3373 /// Should we model this reduction as a pairwise reduction tree or a tree that
3374 /// splits the vector in halves and adds those halves.
3375 bool IsPairwiseReduction;
3376
3377 public:
3378 HorizontalReduction()
3379 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3380 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3381
3382 /// \brief Try to find a reduction tree.
3383 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
3384 const DataLayout *DL) {
3385 assert((!Phi ||
3386 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3387 "This phi needs to use the binary operator");
3388
3389 // We could have an initial reduction that is not an add.
3390 // r *= v1 + v2 + v3 + v4
3391 // In such a case start looking for a tree rooted in the first '+'.
3392 if (Phi) {
3393 if (B->getOperand(0) == Phi) {
3394 Phi = nullptr;
3395 B = dyn_cast<BinaryOperator>(B->getOperand(1));
3396 } else if (B->getOperand(1) == Phi) {
3397 Phi = nullptr;
3398 B = dyn_cast<BinaryOperator>(B->getOperand(0));
3399 }
3400 }
3401
3402 if (!B)
3403 return false;
3404
3405 Type *Ty = B->getType();
3406 if (!isValidElementType(Ty))
3407 return false;
3408
3409 ReductionOpcode = B->getOpcode();
3410 ReducedValueOpcode = 0;
3411 ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
3412 ReductionRoot = B;
3413 ReductionPHI = Phi;
3414
3415 if (ReduxWidth < 4)
3416 return false;
3417
3418 // We currently only support adds.
3419 if (ReductionOpcode != Instruction::Add &&
3420 ReductionOpcode != Instruction::FAdd)
3421 return false;
3422
3423 // Post order traverse the reduction tree starting at B. We only handle true
3424 // trees containing only binary operators.
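// Each stack entry pairs a tree node with the index of the next operand
// edge to visit; once that index reaches 2, both operands have been seen.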
3425 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3426 Stack.push_back(std::make_pair(B, 0));
3427 while (!Stack.empty()) {
3428 BinaryOperator *TreeN = Stack.back().first;
3429 unsigned EdgeToVisit = Stack.back().second++;
3430 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3431
3432 // Only handle trees in the current basic block.
3433 if (TreeN->getParent() != B->getParent())
3434 return false;
3435
3436 // Each tree node needs to have one user except for the ultimate
3437 // reduction.
3438 if (!TreeN->hasOneUse() && TreeN != B)
3439 return false;
3440
3441 // Postorder visit.
3442 if (EdgeToVisit == 2 || IsReducedValue) {
3443 if (IsReducedValue) {
3444 // Make sure that the opcodes of the operations that we are going to
3445 // reduce match.
3446 if (!ReducedValueOpcode)
3447 ReducedValueOpcode = TreeN->getOpcode();
3448 else if (ReducedValueOpcode != TreeN->getOpcode())
3449 return false;
3450 ReducedVals.push_back(TreeN);
3451 } else {
3452 // We need to be able to reassociate the adds.
3453 if (!TreeN->isAssociative())
3454 return false;
3455 ReductionOps.push_back(TreeN);
3456 }
3457 // Retract.
3458 Stack.pop_back();
3459 continue;
3460 }
3461
3462 // Visit left or right.
3463 Value *NextV = TreeN->getOperand(EdgeToVisit);
3464 BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3465 if (Next)
3466 Stack.push_back(std::make_pair(Next, 0));
3467 else if (NextV != Phi)
3468 return false;
3469 }
3470 return true;
3471 }
3472
3473 /// \brief Attempt to vectorize the tree found by
3474 /// matchAssociativeReduction.
3475 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3476 if (ReducedVals.empty())
3477 return false;
3478
3479 unsigned NumReducedVals = ReducedVals.size();
3480 if (NumReducedVals < ReduxWidth)
3481 return false;
3482
3483 Value *VectorizedTree = nullptr;
3484 IRBuilder<> Builder(ReductionRoot);
3485 FastMathFlags Unsafe;
3486 Unsafe.setUnsafeAlgebra();
3487 Builder.SetFastMathFlags(Unsafe);
3488 unsigned i = 0;
3489
3490 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3491 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3492
3493 // Estimate cost.
3494 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3495 if (Cost >= -SLPCostThreshold)
3496 break;
3497
3498 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3499 << ". (HorRdx)\n");
3500
3501 // Vectorize a tree.
3502 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3503 Value *VectorizedRoot = V.vectorizeTree();
3504
3505 // Emit a reduction.
3506 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3507 if (VectorizedTree) {
3508 Builder.SetCurrentDebugLocation(Loc);
3509 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3510 ReducedSubTree, "bin.rdx");
3511 } else
3512 VectorizedTree = ReducedSubTree;
3513 }
3514
3515 if (VectorizedTree) {
3516 // Finish the reduction.
3517 for (; i < NumReducedVals; ++i) {
3518 Builder.SetCurrentDebugLocation(
3519 cast<Instruction>(ReducedVals[i])->getDebugLoc());
3520 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3521 ReducedVals[i]);
3522 }
3523 // Update users.
3524 if (ReductionPHI) {
3525 assert(ReductionRoot && "Need a reduction operation");
3526 ReductionRoot->setOperand(0, VectorizedTree);
3527 ReductionRoot->setOperand(1, ReductionPHI);
3528 } else
3529 ReductionRoot->replaceAllUsesWith(VectorizedTree);
3530 }
3531 return VectorizedTree != nullptr;
3532 }
3533
3534 private:
3535
3536 /// \brief Calculate the cost of a reduction.
3537 int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3538 Type *ScalarTy = FirstReducedVal->getType();
3539 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3540
3541 int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3542 int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
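// Pick whichever shuffle-based strategy the target reports as cheaper and
// compare it against the cost of ReduxWidth scalar reduction operations.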
3543
3544 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3545 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3546
3547 int ScalarReduxCost =
3548 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3549
3550 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3551 << " for reduction that starts with " << *FirstReducedVal
3552 << " (It is a "
3553 << (IsPairwiseReduction ? "pairwise" : "splitting")
3554 << " reduction)\n");
3555
3556 return VecReduxCost - ScalarReduxCost;
3557 }
3558
3559 static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3560 Value *R, const Twine &Name = "") {
3561 if (Opcode == Instruction::FAdd)
3562 return Builder.CreateFAdd(L, R, Name);
3563 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3564 }
3565
3566 /// \brief Emit a horizontal reduction of the vectorized value.
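///
/// A sketch of the emitted IR for a splitting fadd reduction over a
/// hypothetical <4 x float> %vec (names are approximate):
///   %rdx.shuf  = shufflevector <4 x float> %vec, <4 x float> undef,
///                              <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
///   %bin.rdx   = fadd fast <4 x float> %vec, %rdx.shuf
///   %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
///                              <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
///   %bin.rdx2  = fadd fast <4 x float> %bin.rdx, %rdx.shuf1
///   %r         = extractelement <4 x float> %bin.rdx2, i32 0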
3567 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3568 assert(VectorizedValue && "Need to have a vectorized tree node");
3569 assert(isPowerOf2_32(ReduxWidth) &&
3570 "We only handle power-of-two reductions for now");
3571
3572 Value *TmpVec = VectorizedValue;
3573 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3574 if (IsPairwiseReduction) {
3575 Value *LeftMask =
3576 createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3577 Value *RightMask =
3578 createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3579
3580 Value *LeftShuf = Builder.CreateShuffleVector(
3581 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3582 Value *RightShuf = Builder.CreateShuffleVector(
3583 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3584 "rdx.shuf.r");
3585 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3586 "bin.rdx");
3587 } else {
3588 Value *UpperHalf =
3589 createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3590 Value *Shuf = Builder.CreateShuffleVector(
3591 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3592 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3593 }
3594 }
3595
3596 // The result is in the first element of the vector.
3597 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3598 }
3599 };
3600
3601 /// \brief Recognize construction of vectors like
3602 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
3603 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
3604 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
3605 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
3606 ///
3607 /// Returns true if it matches
3608 ///
3609 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3610 SmallVectorImpl<Value *> &BuildVector,
3611 SmallVectorImpl<Value *> &BuildVectorOpds) {
3612 if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3613 return false;
3614
3615 InsertElementInst *IE = FirstInsertElem;
3616 while (true) {
3617 BuildVector.push_back(IE);
3618 BuildVectorOpds.push_back(IE->getOperand(1));
3619
3620 if (IE->use_empty())
3621 return false;
3622
3623 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3624 if (!NextUse)
3625 return true;
3626
3627 // If this isn't the final use, make sure the next insertelement is the only
3628 // use. It's OK if the final constructed vector is used multiple times.
3629 if (!IE->hasOneUse())
3630 return false;
3631
3632 IE = NextUse;
3633 }
3634
3635 return false;
3636 }
3637
3638 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3639 return V->getType() < V2->getType();
3640 }
3641
3642 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3643 bool Changed = false;
3644 SmallVector<Value *, 4> Incoming;
3645 SmallSet<Value *, 16> VisitedInstrs;
3646
3647 bool HaveVectorizedPhiNodes = true;
3648 while (HaveVectorizedPhiNodes) {
3649 HaveVectorizedPhiNodes = false;
3650
3651 // Collect the incoming values from the PHIs.
3652 Incoming.clear();
3653 for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3654 ++instr) {
3655 PHINode *P = dyn_cast<PHINode>(instr);
3656 if (!P)
3657 break;
3658
3659 if (!VisitedInstrs.count(P))
3660 Incoming.push_back(P);
3661 }
3662
3663 // Sort by type.
3664 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3665
3666 // Try to vectorize elements based on their type.
3667 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3668 E = Incoming.end();
3669 IncIt != E;) {
3670
3671 // Look for the next elements with the same type.
3672 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3673 while (SameTypeIt != E &&
3674 (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3675 VisitedInstrs.insert(*SameTypeIt);
3676 ++SameTypeIt;
3677 }
3678
3679 // Try to vectorize them.
3680 unsigned NumElts = (SameTypeIt - IncIt);
3681 DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3682 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3683 // Success: start over because instructions might have been changed.
3684 HaveVectorizedPhiNodes = true;
3685 Changed = true;
3686 break;
3687 }
3688
3689 // Start over at the next instruction of a different type (or the end).
3690 IncIt = SameTypeIt;
3691 }
3692 }
3693
3694 VisitedInstrs.clear();
3695
3696 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3697 // We may go through BB multiple times so skip the ones we have already checked.
3698 if (!VisitedInstrs.insert(it).second)
3699 continue;
3700
3701 if (isa<DbgInfoIntrinsic>(it))
3702 continue;
3703
3704 // Try to vectorize reductions that use PHINodes.
3705 if (PHINode *P = dyn_cast<PHINode>(it)) {
3706 // Check that the PHI is a reduction PHI.
3707 if (P->getNumIncomingValues() != 2)
3708 return Changed;
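// Rdx is the value the PHI receives from this block, i.e. the update of
// the candidate reduction (null if neither incoming edge comes from BB).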
3709 Value *Rdx =
3710 (P->getIncomingBlock(0) == BB
3711 ? (P->getIncomingValue(0))
3712 : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3713 : nullptr));
3714 // Check if this is a Binary Operator.
3715 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3716 if (!BI)
3717 continue;
3718
3719 // Try to match and vectorize a horizontal reduction.
3720 HorizontalReduction HorRdx;
3721 if (ShouldVectorizeHor &&
3722 HorRdx.matchAssociativeReduction(P, BI, DL) &&
3723 HorRdx.tryToReduce(R, TTI)) {
3724 Changed = true;
3725 it = BB->begin();
3726 e = BB->end();
3727 continue;
3728 }
3729
3730 Value *Inst = BI->getOperand(0);
3731 if (Inst == P)
3732 Inst = BI->getOperand(1);
3733
3734 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3735 // We would like to start over since some instructions are deleted
3736 // and the iterator may be invalidated.
3737 Changed = true;
3738 it = BB->begin();
3739 e = BB->end();
3740 continue;
3741 }
3742
3743 continue;
3744 }
3745
3746 // Try to vectorize horizontal reductions feeding into a store.
3747 if (ShouldStartVectorizeHorAtStore)
3748 if (StoreInst *SI = dyn_cast<StoreInst>(it))
3749 if (BinaryOperator *BinOp =
3750 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3751 HorizontalReduction HorRdx;
3752 if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
3753 HorRdx.tryToReduce(R, TTI)) ||
3754 tryToVectorize(BinOp, R))) {
3755 Changed = true;
3756 it = BB->begin();
3757 e = BB->end();
3758 continue;
3759 }
3760 }
3761
3762 // Try to vectorize horizontal reductions feeding into a return.
3763 if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
3764 if (RI->getNumOperands() != 0)
3765 if (BinaryOperator *BinOp =
3766 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
3767 DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
3768 if (tryToVectorizePair(BinOp->getOperand(0),
3769 BinOp->getOperand(1), R)) {
3770 Changed = true;
3771 it = BB->begin();
3772 e = BB->end();
3773 continue;
3774 }
3775 }
3776
3777 // Try to vectorize trees that start at compare instructions.
3778 if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3779 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3780 Changed = true;
3781 // We would like to start over since some instructions are deleted
3782 // and the iterator may be invalidated.
3783 it = BB->begin();
3784 e = BB->end();
3785 continue;
3786 }
3787
3788 for (int i = 0; i < 2; ++i) {
3789 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3790 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3791 Changed = true;
3792 // We would like to start over since some instructions are deleted
3793 // and the iterator may be invalidated.
3794 it = BB->begin();
3795 e = BB->end();
3796 }
3797 }
3798 }
3799 continue;
3800 }
3801
3802 // Try to vectorize trees that start at insertelement instructions.
3803 if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3804 SmallVector<Value *, 16> BuildVector;
3805 SmallVector<Value *, 16> BuildVectorOpds;
3806 if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3807 continue;
3808
3809 // Vectorize starting with the build vector operands ignoring the
3810 // BuildVector instructions for the purpose of scheduling and user
3811 // extraction.
3812 if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
3813 Changed = true;
3814 it = BB->begin();
3815 e = BB->end();
3816 }
3817
3818 continue;
3819 }
3820 }
3821
3822 return Changed;
3823 }
3824
3825 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
3826 bool Changed = false;
3827 // Attempt to sort and vectorize each of the store-groups.
3828 for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
3829 it != e; ++it) {
3830 if (it->second.size() < 2)
3831 continue;
3832
3833 DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
3834 << it->second.size() << ".\n");
3835
3836 // Process the stores in chunks of 16.
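// E.g. (illustrative), a group of 40 stores is handled as slices of
// 16, 16 and 8 so that each call to vectorizeStores stays small.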
3837 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
3838 unsigned Len = std::min<unsigned>(CE - CI, 16);
3839 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
3840 -SLPCostThreshold, R);
3841 }
3842 }
3843 return Changed;
3844 }
3845
3846 } // end anonymous namespace
3847
3848 char SLPVectorizer::ID = 0;
3849 static const char lv_name[] = "SLP Vectorizer";
3850 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
3851 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
3852 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
3853 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
3854 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
3855 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
3856 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
3857
3858 namespace llvm {
3859 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
3860 }
3861