xref: /minix3/external/bsd/llvm/dist/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 0a6a1f1d05b60e214de2f05a7310ddd1f0e590e7)
1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize.h"
19 #include "llvm/ADT/MapVector.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/Optional.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/CodeMetrics.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/ScalarEvolution.h"
29 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/Analysis/ValueTracking.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/NoFolder.h"
39 #include "llvm/IR/Type.h"
40 #include "llvm/IR/Value.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Utils/VectorUtils.h"
47 #include <algorithm>
48 #include <map>
49 #include <memory>
50 
51 using namespace llvm;
52 
53 #define SV_NAME "slp-vectorizer"
54 #define DEBUG_TYPE "SLP"
55 
56 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
57 
58 static cl::opt<int>
59     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
60                      cl::desc("Only vectorize if you gain more than this "
61                               "number "));
62 
63 static cl::opt<bool>
64 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
65                    cl::desc("Attempt to vectorize horizontal reductions"));
66 
67 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
68     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
69     cl::desc(
70         "Attempt to vectorize horizontal reductions feeding into a store"));
71 
72 namespace {
73 
74 static const unsigned MinVecRegSize = 128;
75 
76 static const unsigned RecursionMaxDepth = 12;
77 
78 /// \brief Predicate for the element types that the SLP vectorizer supports.
79 ///
80 /// The most important thing to filter here are types which are invalid in LLVM
81 /// vectors. We also filter target specific types which have absolutely no
82 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
83 /// avoids spending time checking the cost model and realizing that they will
84 /// be inevitably scalarized.
isValidElementType(Type * Ty)85 static bool isValidElementType(Type *Ty) {
86   return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
87          !Ty->isPPC_FP128Ty();
88 }
89 
90 /// \returns the parent basic block if all of the instructions in \p VL
91 /// are in the same block or null otherwise.
getSameBlock(ArrayRef<Value * > VL)92 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
93   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
94   if (!I0)
95     return nullptr;
96   BasicBlock *BB = I0->getParent();
97   for (int i = 1, e = VL.size(); i < e; i++) {
98     Instruction *I = dyn_cast<Instruction>(VL[i]);
99     if (!I)
100       return nullptr;
101 
102     if (BB != I->getParent())
103       return nullptr;
104   }
105   return BB;
106 }
107 
108 /// \returns True if all of the values in \p VL are constants.
allConstant(ArrayRef<Value * > VL)109 static bool allConstant(ArrayRef<Value *> VL) {
110   for (unsigned i = 0, e = VL.size(); i < e; ++i)
111     if (!isa<Constant>(VL[i]))
112       return false;
113   return true;
114 }
115 
116 /// \returns True if all of the values in \p VL are identical.
isSplat(ArrayRef<Value * > VL)117 static bool isSplat(ArrayRef<Value *> VL) {
118   for (unsigned i = 1, e = VL.size(); i < e; ++i)
119     if (VL[i] != VL[0])
120       return false;
121   return true;
122 }
123 
124 ///\returns Opcode that can be clubbed with \p Op to create an alternate
125 /// sequence which can later be merged as a ShuffleVector instruction.
getAltOpcode(unsigned Op)126 static unsigned getAltOpcode(unsigned Op) {
127   switch (Op) {
128   case Instruction::FAdd:
129     return Instruction::FSub;
130   case Instruction::FSub:
131     return Instruction::FAdd;
132   case Instruction::Add:
133     return Instruction::Sub;
134   case Instruction::Sub:
135     return Instruction::Add;
136   default:
137     return 0;
138   }
139 }
140 
141 ///\returns bool representing if Opcode \p Op can be part
142 /// of an alternate sequence which can later be merged as
143 /// a ShuffleVector instruction.
canCombineAsAltInst(unsigned Op)144 static bool canCombineAsAltInst(unsigned Op) {
145   if (Op == Instruction::FAdd || Op == Instruction::FSub ||
146       Op == Instruction::Sub || Op == Instruction::Add)
147     return true;
148   return false;
149 }
150 
151 /// \returns ShuffleVector instruction if intructions in \p VL have
152 ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
153 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
isAltInst(ArrayRef<Value * > VL)154 static unsigned isAltInst(ArrayRef<Value *> VL) {
155   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
156   unsigned Opcode = I0->getOpcode();
157   unsigned AltOpcode = getAltOpcode(Opcode);
158   for (int i = 1, e = VL.size(); i < e; i++) {
159     Instruction *I = dyn_cast<Instruction>(VL[i]);
160     if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
161       return 0;
162   }
163   return Instruction::ShuffleVector;
164 }
165 
166 /// \returns The opcode if all of the Instructions in \p VL have the same
167 /// opcode, or zero.
getSameOpcode(ArrayRef<Value * > VL)168 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
169   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
170   if (!I0)
171     return 0;
172   unsigned Opcode = I0->getOpcode();
173   for (int i = 1, e = VL.size(); i < e; i++) {
174     Instruction *I = dyn_cast<Instruction>(VL[i]);
175     if (!I || Opcode != I->getOpcode()) {
176       if (canCombineAsAltInst(Opcode) && i == 1)
177         return isAltInst(VL);
178       return 0;
179     }
180   }
181   return Opcode;
182 }
183 
184 /// Get the intersection (logical and) of all of the potential IR flags
185 /// of each scalar operation (VL) that will be converted into a vector (I).
186 /// Flag set: NSW, NUW, exact, and all of fast-math.
propagateIRFlags(Value * I,ArrayRef<Value * > VL)187 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
188   if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
189     if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
190       // Intersection is initialized to the 0th scalar,
191       // so start counting from index '1'.
192       for (int i = 1, e = VL.size(); i < e; ++i) {
193         if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
194           Intersection->andIRFlags(Scalar);
195       }
196       VecOp->copyIRFlags(Intersection);
197     }
198   }
199 }
200 
201 /// \returns \p I after propagating metadata from \p VL.
propagateMetadata(Instruction * I,ArrayRef<Value * > VL)202 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
203   Instruction *I0 = cast<Instruction>(VL[0]);
204   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
205   I0->getAllMetadataOtherThanDebugLoc(Metadata);
206 
207   for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
208     unsigned Kind = Metadata[i].first;
209     MDNode *MD = Metadata[i].second;
210 
211     for (int i = 1, e = VL.size(); MD && i != e; i++) {
212       Instruction *I = cast<Instruction>(VL[i]);
213       MDNode *IMD = I->getMetadata(Kind);
214 
215       switch (Kind) {
216       default:
217         MD = nullptr; // Remove unknown metadata
218         break;
219       case LLVMContext::MD_tbaa:
220         MD = MDNode::getMostGenericTBAA(MD, IMD);
221         break;
222       case LLVMContext::MD_alias_scope:
223         MD = MDNode::getMostGenericAliasScope(MD, IMD);
224         break;
225       case LLVMContext::MD_noalias:
226         MD = MDNode::intersect(MD, IMD);
227         break;
228       case LLVMContext::MD_fpmath:
229         MD = MDNode::getMostGenericFPMath(MD, IMD);
230         break;
231       }
232     }
233     I->setMetadata(Kind, MD);
234   }
235   return I;
236 }
237 
238 /// \returns The type that all of the values in \p VL have or null if there
239 /// are different types.
getSameType(ArrayRef<Value * > VL)240 static Type* getSameType(ArrayRef<Value *> VL) {
241   Type *Ty = VL[0]->getType();
242   for (int i = 1, e = VL.size(); i < e; i++)
243     if (VL[i]->getType() != Ty)
244       return nullptr;
245 
246   return Ty;
247 }
248 
249 /// \returns True if the ExtractElement instructions in VL can be vectorized
250 /// to use the original vector.
CanReuseExtract(ArrayRef<Value * > VL)251 static bool CanReuseExtract(ArrayRef<Value *> VL) {
252   assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
253   // Check if all of the extracts come from the same vector and from the
254   // correct offset.
255   Value *VL0 = VL[0];
256   ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
257   Value *Vec = E0->getOperand(0);
258 
259   // We have to extract from the same vector type.
260   unsigned NElts = Vec->getType()->getVectorNumElements();
261 
262   if (NElts != VL.size())
263     return false;
264 
265   // Check that all of the indices extract from the correct offset.
266   ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
267   if (!CI || CI->getZExtValue())
268     return false;
269 
270   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
271     ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
272     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
273 
274     if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
275       return false;
276   }
277 
278   return true;
279 }
280 
reorderInputsAccordingToOpcode(ArrayRef<Value * > VL,SmallVectorImpl<Value * > & Left,SmallVectorImpl<Value * > & Right)281 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
282                                            SmallVectorImpl<Value *> &Left,
283                                            SmallVectorImpl<Value *> &Right) {
284 
285   SmallVector<Value *, 16> OrigLeft, OrigRight;
286 
287   bool AllSameOpcodeLeft = true;
288   bool AllSameOpcodeRight = true;
289   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
290     Instruction *I = cast<Instruction>(VL[i]);
291     Value *V0 = I->getOperand(0);
292     Value *V1 = I->getOperand(1);
293 
294     OrigLeft.push_back(V0);
295     OrigRight.push_back(V1);
296 
297     Instruction *I0 = dyn_cast<Instruction>(V0);
298     Instruction *I1 = dyn_cast<Instruction>(V1);
299 
300     // Check whether all operands on one side have the same opcode. In this case
301     // we want to preserve the original order and not make things worse by
302     // reordering.
303     AllSameOpcodeLeft = I0;
304     AllSameOpcodeRight = I1;
305 
306     if (i && AllSameOpcodeLeft) {
307       if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
308         if(P0->getOpcode() != I0->getOpcode())
309           AllSameOpcodeLeft = false;
310       } else
311         AllSameOpcodeLeft = false;
312     }
313     if (i && AllSameOpcodeRight) {
314       if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
315         if(P1->getOpcode() != I1->getOpcode())
316           AllSameOpcodeRight = false;
317       } else
318         AllSameOpcodeRight = false;
319     }
320 
321     // Sort two opcodes. In the code below we try to preserve the ability to use
322     // broadcast of values instead of individual inserts.
323     // vl1 = load
324     // vl2 = phi
325     // vr1 = load
326     // vr2 = vr2
327     //    = vl1 x vr1
328     //    = vl2 x vr2
329     // If we just sorted according to opcode we would leave the first line in
330     // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
331     //    = vl1 x vr1
332     //    = vr2 x vl2
333     // Because vr2 and vr1 are from the same load we loose the opportunity of a
334     // broadcast for the packed right side in the backend: we have [vr1, vl2]
335     // instead of [vr1, vr2=vr1].
336     if (I0 && I1) {
337        if(!i && I0->getOpcode() > I1->getOpcode()) {
338          Left.push_back(I1);
339          Right.push_back(I0);
340        } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
341          // Try not to destroy a broad cast for no apparent benefit.
342          Left.push_back(I1);
343          Right.push_back(I0);
344        } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
345          // Try preserve broadcasts.
346          Left.push_back(I1);
347          Right.push_back(I0);
348        } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
349          // Try preserve broadcasts.
350          Left.push_back(I1);
351          Right.push_back(I0);
352        } else {
353          Left.push_back(I0);
354          Right.push_back(I1);
355        }
356        continue;
357     }
358     // One opcode, put the instruction on the right.
359     if (I0) {
360       Left.push_back(V1);
361       Right.push_back(I0);
362       continue;
363     }
364     Left.push_back(V0);
365     Right.push_back(V1);
366   }
367 
368   bool LeftBroadcast = isSplat(Left);
369   bool RightBroadcast = isSplat(Right);
370 
371   // Don't reorder if the operands where good to begin with.
372   if (!(LeftBroadcast || RightBroadcast) &&
373       (AllSameOpcodeRight || AllSameOpcodeLeft)) {
374     Left = OrigLeft;
375     Right = OrigRight;
376   }
377 }
378 
379 /// \returns True if in-tree use also needs extract. This refers to
380 /// possible scalar operand in vectorized instruction.
InTreeUserNeedToExtract(Value * Scalar,Instruction * UserInst,TargetLibraryInfo * TLI)381 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
382                                     TargetLibraryInfo *TLI) {
383 
384   unsigned Opcode = UserInst->getOpcode();
385   switch (Opcode) {
386   case Instruction::Load: {
387     LoadInst *LI = cast<LoadInst>(UserInst);
388     return (LI->getPointerOperand() == Scalar);
389   }
390   case Instruction::Store: {
391     StoreInst *SI = cast<StoreInst>(UserInst);
392     return (SI->getPointerOperand() == Scalar);
393   }
394   case Instruction::Call: {
395     CallInst *CI = cast<CallInst>(UserInst);
396     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
397     if (hasVectorInstrinsicScalarOpd(ID, 1)) {
398       return (CI->getArgOperand(1) == Scalar);
399     }
400   }
401   default:
402     return false;
403   }
404 }
405 
406 /// \returns the AA location that is being access by the instruction.
getLocation(Instruction * I,AliasAnalysis * AA)407 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
408   if (StoreInst *SI = dyn_cast<StoreInst>(I))
409     return AA->getLocation(SI);
410   if (LoadInst *LI = dyn_cast<LoadInst>(I))
411     return AA->getLocation(LI);
412   return AliasAnalysis::Location();
413 }
414 
415 /// Bottom Up SLP Vectorizer.
416 class BoUpSLP {
417 public:
418   typedef SmallVector<Value *, 8> ValueList;
419   typedef SmallVector<Instruction *, 16> InstrList;
420   typedef SmallPtrSet<Value *, 16> ValueSet;
421   typedef SmallVector<StoreInst *, 8> StoreList;
422 
BoUpSLP(Function * Func,ScalarEvolution * Se,const DataLayout * Dl,TargetTransformInfo * Tti,TargetLibraryInfo * TLi,AliasAnalysis * Aa,LoopInfo * Li,DominatorTree * Dt,AssumptionCache * AC)423   BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
424           TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
425           LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
426       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
427         SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
428         Builder(Se->getContext()) {
429     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
430   }
431 
432   /// \brief Vectorize the tree that starts with the elements in \p VL.
433   /// Returns the vectorized root.
434   Value *vectorizeTree();
435 
436   /// \returns the cost incurred by unwanted spills and fills, caused by
437   /// holding live values over call sites.
438   int getSpillCost();
439 
440   /// \returns the vectorization cost of the subtree that starts at \p VL.
441   /// A negative number means that this is profitable.
442   int getTreeCost();
443 
444   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
445   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
446   void buildTree(ArrayRef<Value *> Roots,
447                  ArrayRef<Value *> UserIgnoreLst = None);
448 
449   /// Clear the internal data structures that are created by 'buildTree'.
deleteTree()450   void deleteTree() {
451     VectorizableTree.clear();
452     ScalarToTreeEntry.clear();
453     MustGather.clear();
454     ExternalUses.clear();
455     NumLoadsWantToKeepOrder = 0;
456     NumLoadsWantToChangeOrder = 0;
457     for (auto &Iter : BlocksSchedules) {
458       BlockScheduling *BS = Iter.second.get();
459       BS->clear();
460     }
461   }
462 
463   /// \returns true if the memory operations A and B are consecutive.
464   bool isConsecutiveAccess(Value *A, Value *B);
465 
466   /// \brief Perform LICM and CSE on the newly generated gather sequences.
467   void optimizeGatherSequence();
468 
469   /// \returns true if it is benefitial to reverse the vector order.
shouldReorder() const470   bool shouldReorder() const {
471     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
472   }
473 
474 private:
475   struct TreeEntry;
476 
477   /// \returns the cost of the vectorizable entry.
478   int getEntryCost(TreeEntry *E);
479 
480   /// This is the recursive part of buildTree.
481   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
482 
483   /// Vectorize a single entry in the tree.
484   Value *vectorizeTree(TreeEntry *E);
485 
486   /// Vectorize a single entry in the tree, starting in \p VL.
487   Value *vectorizeTree(ArrayRef<Value *> VL);
488 
489   /// \returns the pointer to the vectorized value if \p VL is already
490   /// vectorized, or NULL. They may happen in cycles.
491   Value *alreadyVectorized(ArrayRef<Value *> VL) const;
492 
493   /// \brief Take the pointer operand from the Load/Store instruction.
494   /// \returns NULL if this is not a valid Load/Store instruction.
495   static Value *getPointerOperand(Value *I);
496 
497   /// \brief Take the address space operand from the Load/Store instruction.
498   /// \returns -1 if this is not a valid Load/Store instruction.
499   static unsigned getAddressSpaceOperand(Value *I);
500 
501   /// \returns the scalarization cost for this type. Scalarization in this
502   /// context means the creation of vectors from a group of scalars.
503   int getGatherCost(Type *Ty);
504 
505   /// \returns the scalarization cost for this list of values. Assuming that
506   /// this subtree gets vectorized, we may need to extract the values from the
507   /// roots. This method calculates the cost of extracting the values.
508   int getGatherCost(ArrayRef<Value *> VL);
509 
510   /// \brief Set the Builder insert point to one after the last instruction in
511   /// the bundle
512   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
513 
514   /// \returns a vector from a collection of scalars in \p VL.
515   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
516 
517   /// \returns whether the VectorizableTree is fully vectoriable and will
518   /// be beneficial even the tree height is tiny.
519   bool isFullyVectorizableTinyTree();
520 
521   struct TreeEntry {
TreeEntry__anon6868a6e70111::BoUpSLP::TreeEntry522     TreeEntry() : Scalars(), VectorizedValue(nullptr),
523     NeedToGather(0) {}
524 
525     /// \returns true if the scalars in VL are equal to this entry.
isSame__anon6868a6e70111::BoUpSLP::TreeEntry526     bool isSame(ArrayRef<Value *> VL) const {
527       assert(VL.size() == Scalars.size() && "Invalid size");
528       return std::equal(VL.begin(), VL.end(), Scalars.begin());
529     }
530 
531     /// A vector of scalars.
532     ValueList Scalars;
533 
534     /// The Scalars are vectorized into this value. It is initialized to Null.
535     Value *VectorizedValue;
536 
537     /// Do we need to gather this sequence ?
538     bool NeedToGather;
539   };
540 
541   /// Create a new VectorizableTree entry.
newTreeEntry(ArrayRef<Value * > VL,bool Vectorized)542   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
543     VectorizableTree.push_back(TreeEntry());
544     int idx = VectorizableTree.size() - 1;
545     TreeEntry *Last = &VectorizableTree[idx];
546     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
547     Last->NeedToGather = !Vectorized;
548     if (Vectorized) {
549       for (int i = 0, e = VL.size(); i != e; ++i) {
550         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
551         ScalarToTreeEntry[VL[i]] = idx;
552       }
553     } else {
554       MustGather.insert(VL.begin(), VL.end());
555     }
556     return Last;
557   }
558 
559   /// -- Vectorization State --
560   /// Holds all of the tree entries.
561   std::vector<TreeEntry> VectorizableTree;
562 
563   /// Maps a specific scalar to its tree entry.
564   SmallDenseMap<Value*, int> ScalarToTreeEntry;
565 
566   /// A list of scalars that we found that we need to keep as scalars.
567   ValueSet MustGather;
568 
569   /// This POD struct describes one external user in the vectorized tree.
570   struct ExternalUser {
ExternalUser__anon6868a6e70111::BoUpSLP::ExternalUser571     ExternalUser (Value *S, llvm::User *U, int L) :
572       Scalar(S), User(U), Lane(L){};
573     // Which scalar in our function.
574     Value *Scalar;
575     // Which user that uses the scalar.
576     llvm::User *User;
577     // Which lane does the scalar belong to.
578     int Lane;
579   };
580   typedef SmallVector<ExternalUser, 16> UserList;
581 
582   /// Checks if two instructions may access the same memory.
583   ///
584   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
585   /// is invariant in the calling loop.
isAliased(const AliasAnalysis::Location & Loc1,Instruction * Inst1,Instruction * Inst2)586   bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
587                  Instruction *Inst2) {
588 
589     // First check if the result is already in the cache.
590     AliasCacheKey key = std::make_pair(Inst1, Inst2);
591     Optional<bool> &result = AliasCache[key];
592     if (result.hasValue()) {
593       return result.getValue();
594     }
595     AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
596     bool aliased = true;
597     if (Loc1.Ptr && Loc2.Ptr) {
598       // Do the alias check.
599       aliased = AA->alias(Loc1, Loc2);
600     }
601     // Store the result in the cache.
602     result = aliased;
603     return aliased;
604   }
605 
606   typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
607 
608   /// Cache for alias results.
609   /// TODO: consider moving this to the AliasAnalysis itself.
610   DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
611 
612   /// Removes an instruction from its block and eventually deletes it.
613   /// It's like Instruction::eraseFromParent() except that the actual deletion
614   /// is delayed until BoUpSLP is destructed.
615   /// This is required to ensure that there are no incorrect collisions in the
616   /// AliasCache, which can happen if a new instruction is allocated at the
617   /// same address as a previously deleted instruction.
eraseInstruction(Instruction * I)618   void eraseInstruction(Instruction *I) {
619     I->removeFromParent();
620     I->dropAllReferences();
621     DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
622   }
623 
624   /// Temporary store for deleted instructions. Instructions will be deleted
625   /// eventually when the BoUpSLP is destructed.
626   SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
627 
628   /// A list of values that need to extracted out of the tree.
629   /// This list holds pairs of (Internal Scalar : External User).
630   UserList ExternalUses;
631 
632   /// Values used only by @llvm.assume calls.
633   SmallPtrSet<const Value *, 32> EphValues;
634 
635   /// Holds all of the instructions that we gathered.
636   SetVector<Instruction *> GatherSeq;
637   /// A list of blocks that we are going to CSE.
638   SetVector<BasicBlock *> CSEBlocks;
639 
640   /// Contains all scheduling relevant data for an instruction.
641   /// A ScheduleData either represents a single instruction or a member of an
642   /// instruction bundle (= a group of instructions which is combined into a
643   /// vector instruction).
644   struct ScheduleData {
645 
646     // The initial value for the dependency counters. It means that the
647     // dependencies are not calculated yet.
648     enum { InvalidDeps = -1 };
649 
ScheduleData__anon6868a6e70111::BoUpSLP::ScheduleData650     ScheduleData()
651         : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
652           NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
653           Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
654           UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
655 
init__anon6868a6e70111::BoUpSLP::ScheduleData656     void init(int BlockSchedulingRegionID) {
657       FirstInBundle = this;
658       NextInBundle = nullptr;
659       NextLoadStore = nullptr;
660       IsScheduled = false;
661       SchedulingRegionID = BlockSchedulingRegionID;
662       UnscheduledDepsInBundle = UnscheduledDeps;
663       clearDependencies();
664     }
665 
666     /// Returns true if the dependency information has been calculated.
hasValidDependencies__anon6868a6e70111::BoUpSLP::ScheduleData667     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
668 
669     /// Returns true for single instructions and for bundle representatives
670     /// (= the head of a bundle).
isSchedulingEntity__anon6868a6e70111::BoUpSLP::ScheduleData671     bool isSchedulingEntity() const { return FirstInBundle == this; }
672 
673     /// Returns true if it represents an instruction bundle and not only a
674     /// single instruction.
isPartOfBundle__anon6868a6e70111::BoUpSLP::ScheduleData675     bool isPartOfBundle() const {
676       return NextInBundle != nullptr || FirstInBundle != this;
677     }
678 
679     /// Returns true if it is ready for scheduling, i.e. it has no more
680     /// unscheduled depending instructions/bundles.
isReady__anon6868a6e70111::BoUpSLP::ScheduleData681     bool isReady() const {
682       assert(isSchedulingEntity() &&
683              "can't consider non-scheduling entity for ready list");
684       return UnscheduledDepsInBundle == 0 && !IsScheduled;
685     }
686 
687     /// Modifies the number of unscheduled dependencies, also updating it for
688     /// the whole bundle.
incrementUnscheduledDeps__anon6868a6e70111::BoUpSLP::ScheduleData689     int incrementUnscheduledDeps(int Incr) {
690       UnscheduledDeps += Incr;
691       return FirstInBundle->UnscheduledDepsInBundle += Incr;
692     }
693 
694     /// Sets the number of unscheduled dependencies to the number of
695     /// dependencies.
resetUnscheduledDeps__anon6868a6e70111::BoUpSLP::ScheduleData696     void resetUnscheduledDeps() {
697       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
698     }
699 
700     /// Clears all dependency information.
clearDependencies__anon6868a6e70111::BoUpSLP::ScheduleData701     void clearDependencies() {
702       Dependencies = InvalidDeps;
703       resetUnscheduledDeps();
704       MemoryDependencies.clear();
705     }
706 
dump__anon6868a6e70111::BoUpSLP::ScheduleData707     void dump(raw_ostream &os) const {
708       if (!isSchedulingEntity()) {
709         os << "/ " << *Inst;
710       } else if (NextInBundle) {
711         os << '[' << *Inst;
712         ScheduleData *SD = NextInBundle;
713         while (SD) {
714           os << ';' << *SD->Inst;
715           SD = SD->NextInBundle;
716         }
717         os << ']';
718       } else {
719         os << *Inst;
720       }
721     }
722 
723     Instruction *Inst;
724 
725     /// Points to the head in an instruction bundle (and always to this for
726     /// single instructions).
727     ScheduleData *FirstInBundle;
728 
729     /// Single linked list of all instructions in a bundle. Null if it is a
730     /// single instruction.
731     ScheduleData *NextInBundle;
732 
733     /// Single linked list of all memory instructions (e.g. load, store, call)
734     /// in the block - until the end of the scheduling region.
735     ScheduleData *NextLoadStore;
736 
737     /// The dependent memory instructions.
738     /// This list is derived on demand in calculateDependencies().
739     SmallVector<ScheduleData *, 4> MemoryDependencies;
740 
741     /// This ScheduleData is in the current scheduling region if this matches
742     /// the current SchedulingRegionID of BlockScheduling.
743     int SchedulingRegionID;
744 
745     /// Used for getting a "good" final ordering of instructions.
746     int SchedulingPriority;
747 
748     /// The number of dependencies. Constitutes of the number of users of the
749     /// instruction plus the number of dependent memory instructions (if any).
750     /// This value is calculated on demand.
751     /// If InvalidDeps, the number of dependencies is not calculated yet.
752     ///
753     int Dependencies;
754 
755     /// The number of dependencies minus the number of dependencies of scheduled
756     /// instructions. As soon as this is zero, the instruction/bundle gets ready
757     /// for scheduling.
758     /// Note that this is negative as long as Dependencies is not calculated.
759     int UnscheduledDeps;
760 
761     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
762     /// single instructions.
763     int UnscheduledDepsInBundle;
764 
765     /// True if this instruction is scheduled (or considered as scheduled in the
766     /// dry-run).
767     bool IsScheduled;
768   };
769 
770 #ifndef NDEBUG
771   friend raw_ostream &operator<<(raw_ostream &os,
772                                  const BoUpSLP::ScheduleData &SD);
773 #endif
774 
775   /// Contains all scheduling data for a basic block.
776   ///
777   struct BlockScheduling {
778 
BlockScheduling__anon6868a6e70111::BoUpSLP::BlockScheduling779     BlockScheduling(BasicBlock *BB)
780         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
781           ScheduleStart(nullptr), ScheduleEnd(nullptr),
782           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
783           // Make sure that the initial SchedulingRegionID is greater than the
784           // initial SchedulingRegionID in ScheduleData (which is 0).
785           SchedulingRegionID(1) {}
786 
clear__anon6868a6e70111::BoUpSLP::BlockScheduling787     void clear() {
788       ReadyInsts.clear();
789       ScheduleStart = nullptr;
790       ScheduleEnd = nullptr;
791       FirstLoadStoreInRegion = nullptr;
792       LastLoadStoreInRegion = nullptr;
793 
794       // Make a new scheduling region, i.e. all existing ScheduleData is not
795       // in the new region yet.
796       ++SchedulingRegionID;
797     }
798 
getScheduleData__anon6868a6e70111::BoUpSLP::BlockScheduling799     ScheduleData *getScheduleData(Value *V) {
800       ScheduleData *SD = ScheduleDataMap[V];
801       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
802         return SD;
803       return nullptr;
804     }
805 
isInSchedulingRegion__anon6868a6e70111::BoUpSLP::BlockScheduling806     bool isInSchedulingRegion(ScheduleData *SD) {
807       return SD->SchedulingRegionID == SchedulingRegionID;
808     }
809 
810     /// Marks an instruction as scheduled and puts all dependent ready
811     /// instructions into the ready-list.
812     template <typename ReadyListType>
schedule__anon6868a6e70111::BoUpSLP::BlockScheduling813     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
814       SD->IsScheduled = true;
815       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
816 
817       ScheduleData *BundleMember = SD;
818       while (BundleMember) {
819         // Handle the def-use chain dependencies.
820         for (Use &U : BundleMember->Inst->operands()) {
821           ScheduleData *OpDef = getScheduleData(U.get());
822           if (OpDef && OpDef->hasValidDependencies() &&
823               OpDef->incrementUnscheduledDeps(-1) == 0) {
824             // There are no more unscheduled dependencies after decrementing,
825             // so we can put the dependent instruction into the ready list.
826             ScheduleData *DepBundle = OpDef->FirstInBundle;
827             assert(!DepBundle->IsScheduled &&
828                    "already scheduled bundle gets ready");
829             ReadyList.insert(DepBundle);
830             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
831           }
832         }
833         // Handle the memory dependencies.
834         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
835           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
836             // There are no more unscheduled dependencies after decrementing,
837             // so we can put the dependent instruction into the ready list.
838             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
839             assert(!DepBundle->IsScheduled &&
840                    "already scheduled bundle gets ready");
841             ReadyList.insert(DepBundle);
842             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
843           }
844         }
845         BundleMember = BundleMember->NextInBundle;
846       }
847     }
848 
849     /// Put all instructions into the ReadyList which are ready for scheduling.
850     template <typename ReadyListType>
initialFillReadyList__anon6868a6e70111::BoUpSLP::BlockScheduling851     void initialFillReadyList(ReadyListType &ReadyList) {
852       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
853         ScheduleData *SD = getScheduleData(I);
854         if (SD->isSchedulingEntity() && SD->isReady()) {
855           ReadyList.insert(SD);
856           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
857         }
858       }
859     }
860 
861     /// Checks if a bundle of instructions can be scheduled, i.e. has no
862     /// cyclic dependencies. This is only a dry-run, no instructions are
863     /// actually moved at this stage.
864     bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
865 
866     /// Un-bundles a group of instructions.
867     void cancelScheduling(ArrayRef<Value *> VL);
868 
869     /// Extends the scheduling region so that V is inside the region.
870     void extendSchedulingRegion(Value *V);
871 
872     /// Initialize the ScheduleData structures for new instructions in the
873     /// scheduling region.
874     void initScheduleData(Instruction *FromI, Instruction *ToI,
875                           ScheduleData *PrevLoadStore,
876                           ScheduleData *NextLoadStore);
877 
878     /// Updates the dependency information of a bundle and of all instructions/
879     /// bundles which depend on the original bundle.
880     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
881                                BoUpSLP *SLP);
882 
883     /// Sets all instruction in the scheduling region to un-scheduled.
884     void resetSchedule();
885 
886     BasicBlock *BB;
887 
888     /// Simple memory allocation for ScheduleData.
889     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
890 
891     /// The size of a ScheduleData array in ScheduleDataChunks.
892     int ChunkSize;
893 
894     /// The allocator position in the current chunk, which is the last entry
895     /// of ScheduleDataChunks.
896     int ChunkPos;
897 
898     /// Attaches ScheduleData to Instruction.
899     /// Note that the mapping survives during all vectorization iterations, i.e.
900     /// ScheduleData structures are recycled.
901     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
902 
903     struct ReadyList : SmallVector<ScheduleData *, 8> {
insert__anon6868a6e70111::BoUpSLP::BlockScheduling::ReadyList904       void insert(ScheduleData *SD) { push_back(SD); }
905     };
906 
907     /// The ready-list for scheduling (only used for the dry-run).
908     ReadyList ReadyInsts;
909 
910     /// The first instruction of the scheduling region.
911     Instruction *ScheduleStart;
912 
913     /// The first instruction _after_ the scheduling region.
914     Instruction *ScheduleEnd;
915 
916     /// The first memory accessing instruction in the scheduling region
917     /// (can be null).
918     ScheduleData *FirstLoadStoreInRegion;
919 
920     /// The last memory accessing instruction in the scheduling region
921     /// (can be null).
922     ScheduleData *LastLoadStoreInRegion;
923 
924     /// The ID of the scheduling region. For a new vectorization iteration this
925     /// is incremented which "removes" all ScheduleData from the region.
926     int SchedulingRegionID;
927   };
928 
929   /// Attaches the BlockScheduling structures to basic blocks.
930   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
931 
932   /// Performs the "real" scheduling. Done before vectorization is actually
933   /// performed in a basic block.
934   void scheduleBlock(BlockScheduling *BS);
935 
936   /// List of users to ignore during scheduling and that don't need extracting.
937   ArrayRef<Value *> UserIgnoreList;
938 
939   // Number of load-bundles, which contain consecutive loads.
940   int NumLoadsWantToKeepOrder;
941 
942   // Number of load-bundles of size 2, which are consecutive loads if reversed.
943   int NumLoadsWantToChangeOrder;
944 
945   // Analysis and block reference.
946   Function *F;
947   ScalarEvolution *SE;
948   const DataLayout *DL;
949   TargetTransformInfo *TTI;
950   TargetLibraryInfo *TLI;
951   AliasAnalysis *AA;
952   LoopInfo *LI;
953   DominatorTree *DT;
954   /// Instruction builder to construct the vectorized tree.
955   IRBuilder<> Builder;
956 };
957 
958 #ifndef NDEBUG
operator <<(raw_ostream & os,const BoUpSLP::ScheduleData & SD)959 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
960   SD.dump(os);
961   return os;
962 }
963 #endif
964 
buildTree(ArrayRef<Value * > Roots,ArrayRef<Value * > UserIgnoreLst)965 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
966                         ArrayRef<Value *> UserIgnoreLst) {
967   deleteTree();
968   UserIgnoreList = UserIgnoreLst;
969   if (!getSameType(Roots))
970     return;
971   buildTree_rec(Roots, 0);
972 
973   // Collect the values that we need to extract from the tree.
974   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
975     TreeEntry *Entry = &VectorizableTree[EIdx];
976 
977     // For each lane:
978     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
979       Value *Scalar = Entry->Scalars[Lane];
980 
981       // No need to handle users of gathered values.
982       if (Entry->NeedToGather)
983         continue;
984 
985       for (User *U : Scalar->users()) {
986         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
987 
988         Instruction *UserInst = dyn_cast<Instruction>(U);
989         if (!UserInst)
990           continue;
991 
992         // Skip in-tree scalars that become vectors
993         if (ScalarToTreeEntry.count(U)) {
994           int Idx = ScalarToTreeEntry[U];
995           TreeEntry *UseEntry = &VectorizableTree[Idx];
996           Value *UseScalar = UseEntry->Scalars[0];
997           // Some in-tree scalars will remain as scalar in vectorized
998           // instructions. If that is the case, the one in Lane 0 will
999           // be used.
1000           if (UseScalar != U ||
1001               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
1002             DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
1003                          << ".\n");
1004             assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
1005             continue;
1006           }
1007         }
1008 
1009         // Ignore users in the user ignore list.
1010         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
1011             UserIgnoreList.end())
1012           continue;
1013 
1014         DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
1015               Lane << " from " << *Scalar << ".\n");
1016         ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
1017       }
1018     }
1019   }
1020 }
1021 
1022 
buildTree_rec(ArrayRef<Value * > VL,unsigned Depth)1023 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
1024   bool SameTy = getSameType(VL); (void)SameTy;
1025   bool isAltShuffle = false;
1026   assert(SameTy && "Invalid types!");
1027 
1028   if (Depth == RecursionMaxDepth) {
1029     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1030     newTreeEntry(VL, false);
1031     return;
1032   }
1033 
1034   // Don't handle vectors.
1035   if (VL[0]->getType()->isVectorTy()) {
1036     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1037     newTreeEntry(VL, false);
1038     return;
1039   }
1040 
1041   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1042     if (SI->getValueOperand()->getType()->isVectorTy()) {
1043       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1044       newTreeEntry(VL, false);
1045       return;
1046     }
1047   unsigned Opcode = getSameOpcode(VL);
1048 
1049   // Check that this shuffle vector refers to the alternate
1050   // sequence of opcodes.
1051   if (Opcode == Instruction::ShuffleVector) {
1052     Instruction *I0 = dyn_cast<Instruction>(VL[0]);
1053     unsigned Op = I0->getOpcode();
1054     if (Op != Instruction::ShuffleVector)
1055       isAltShuffle = true;
1056   }
1057 
1058   // If all of the operands are identical or constant we have a simple solution.
1059   if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
1060     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1061     newTreeEntry(VL, false);
1062     return;
1063   }
1064 
1065   // We now know that this is a vector of instructions of the same type from
1066   // the same block.
1067 
1068   // Don't vectorize ephemeral values.
1069   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1070     if (EphValues.count(VL[i])) {
1071       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1072             ") is ephemeral.\n");
1073       newTreeEntry(VL, false);
1074       return;
1075     }
1076   }
1077 
1078   // Check if this is a duplicate of another entry.
1079   if (ScalarToTreeEntry.count(VL[0])) {
1080     int Idx = ScalarToTreeEntry[VL[0]];
1081     TreeEntry *E = &VectorizableTree[Idx];
1082     for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1083       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1084       if (E->Scalars[i] != VL[i]) {
1085         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1086         newTreeEntry(VL, false);
1087         return;
1088       }
1089     }
1090     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1091     return;
1092   }
1093 
1094   // Check that none of the instructions in the bundle are already in the tree.
1095   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1096     if (ScalarToTreeEntry.count(VL[i])) {
1097       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1098             ") is already in tree.\n");
1099       newTreeEntry(VL, false);
1100       return;
1101     }
1102   }
1103 
1104   // If any of the scalars is marked as a value that needs to stay scalar then
1105   // we need to gather the scalars.
1106   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1107     if (MustGather.count(VL[i])) {
1108       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1109       newTreeEntry(VL, false);
1110       return;
1111     }
1112   }
1113 
1114   // Check that all of the users of the scalars that we want to vectorize are
1115   // schedulable.
1116   Instruction *VL0 = cast<Instruction>(VL[0]);
1117   BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1118 
1119   if (!DT->isReachableFromEntry(BB)) {
1120     // Don't go into unreachable blocks. They may contain instructions with
1121     // dependency cycles which confuse the final scheduling.
1122     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1123     newTreeEntry(VL, false);
1124     return;
1125   }
1126 
1127   // Check that every instructions appears once in this bundle.
1128   for (unsigned i = 0, e = VL.size(); i < e; ++i)
1129     for (unsigned j = i+1; j < e; ++j)
1130       if (VL[i] == VL[j]) {
1131         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1132         newTreeEntry(VL, false);
1133         return;
1134       }
1135 
1136   auto &BSRef = BlocksSchedules[BB];
1137   if (!BSRef) {
1138     BSRef = llvm::make_unique<BlockScheduling>(BB);
1139   }
1140   BlockScheduling &BS = *BSRef.get();
1141 
1142   if (!BS.tryScheduleBundle(VL, this)) {
1143     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1144     BS.cancelScheduling(VL);
1145     newTreeEntry(VL, false);
1146     return;
1147   }
1148   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1149 
1150   switch (Opcode) {
1151     case Instruction::PHI: {
1152       PHINode *PH = dyn_cast<PHINode>(VL0);
1153 
1154       // Check for terminator values (e.g. invoke).
1155       for (unsigned j = 0; j < VL.size(); ++j)
1156         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1157           TerminatorInst *Term = dyn_cast<TerminatorInst>(
1158               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1159           if (Term) {
1160             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1161             BS.cancelScheduling(VL);
1162             newTreeEntry(VL, false);
1163             return;
1164           }
1165         }
1166 
1167       newTreeEntry(VL, true);
1168       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1169 
1170       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1171         ValueList Operands;
1172         // Prepare the operand vector.
1173         for (unsigned j = 0; j < VL.size(); ++j)
1174           Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1175               PH->getIncomingBlock(i)));
1176 
1177         buildTree_rec(Operands, Depth + 1);
1178       }
1179       return;
1180     }
1181     case Instruction::ExtractElement: {
1182       bool Reuse = CanReuseExtract(VL);
1183       if (Reuse) {
1184         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1185       } else {
1186         BS.cancelScheduling(VL);
1187       }
1188       newTreeEntry(VL, Reuse);
1189       return;
1190     }
1191     case Instruction::Load: {
1192       // Check if the loads are consecutive or of we need to swizzle them.
1193       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1194         LoadInst *L = cast<LoadInst>(VL[i]);
1195         if (!L->isSimple()) {
1196           BS.cancelScheduling(VL);
1197           newTreeEntry(VL, false);
1198           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1199           return;
1200         }
1201         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1202           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
1203             ++NumLoadsWantToChangeOrder;
1204           }
1205           BS.cancelScheduling(VL);
1206           newTreeEntry(VL, false);
1207           DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1208           return;
1209         }
1210       }
1211       ++NumLoadsWantToKeepOrder;
1212       newTreeEntry(VL, true);
1213       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1214       return;
1215     }
1216     case Instruction::ZExt:
1217     case Instruction::SExt:
1218     case Instruction::FPToUI:
1219     case Instruction::FPToSI:
1220     case Instruction::FPExt:
1221     case Instruction::PtrToInt:
1222     case Instruction::IntToPtr:
1223     case Instruction::SIToFP:
1224     case Instruction::UIToFP:
1225     case Instruction::Trunc:
1226     case Instruction::FPTrunc:
1227     case Instruction::BitCast: {
1228       Type *SrcTy = VL0->getOperand(0)->getType();
1229       for (unsigned i = 0; i < VL.size(); ++i) {
1230         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1231         if (Ty != SrcTy || !isValidElementType(Ty)) {
1232           BS.cancelScheduling(VL);
1233           newTreeEntry(VL, false);
1234           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1235           return;
1236         }
1237       }
1238       newTreeEntry(VL, true);
1239       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1240 
1241       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1242         ValueList Operands;
1243         // Prepare the operand vector.
1244         for (unsigned j = 0; j < VL.size(); ++j)
1245           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1246 
1247         buildTree_rec(Operands, Depth+1);
1248       }
1249       return;
1250     }
1251     case Instruction::ICmp:
1252     case Instruction::FCmp: {
1253       // Check that all of the compares have the same predicate.
1254       CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
1255       Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1256       for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1257         CmpInst *Cmp = cast<CmpInst>(VL[i]);
1258         if (Cmp->getPredicate() != P0 ||
1259             Cmp->getOperand(0)->getType() != ComparedTy) {
1260           BS.cancelScheduling(VL);
1261           newTreeEntry(VL, false);
1262           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1263           return;
1264         }
1265       }
1266 
1267       newTreeEntry(VL, true);
1268       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1269 
1270       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1271         ValueList Operands;
1272         // Prepare the operand vector.
1273         for (unsigned j = 0; j < VL.size(); ++j)
1274           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1275 
1276         buildTree_rec(Operands, Depth+1);
1277       }
1278       return;
1279     }
1280     case Instruction::Select:
1281     case Instruction::Add:
1282     case Instruction::FAdd:
1283     case Instruction::Sub:
1284     case Instruction::FSub:
1285     case Instruction::Mul:
1286     case Instruction::FMul:
1287     case Instruction::UDiv:
1288     case Instruction::SDiv:
1289     case Instruction::FDiv:
1290     case Instruction::URem:
1291     case Instruction::SRem:
1292     case Instruction::FRem:
1293     case Instruction::Shl:
1294     case Instruction::LShr:
1295     case Instruction::AShr:
1296     case Instruction::And:
1297     case Instruction::Or:
1298     case Instruction::Xor: {
1299       newTreeEntry(VL, true);
1300       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1301 
1302       // Sort operands of the instructions so that each side is more likely to
1303       // have the same opcode.
1304       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1305         ValueList Left, Right;
1306         reorderInputsAccordingToOpcode(VL, Left, Right);
1307         buildTree_rec(Left, Depth + 1);
1308         buildTree_rec(Right, Depth + 1);
1309         return;
1310       }
1311 
1312       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1313         ValueList Operands;
1314         // Prepare the operand vector.
1315         for (unsigned j = 0; j < VL.size(); ++j)
1316           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1317 
1318         buildTree_rec(Operands, Depth+1);
1319       }
1320       return;
1321     }
1322     case Instruction::GetElementPtr: {
1323       // We don't combine GEPs with complicated (nested) indexing.
1324       for (unsigned j = 0; j < VL.size(); ++j) {
1325         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1326           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1327           BS.cancelScheduling(VL);
1328           newTreeEntry(VL, false);
1329           return;
1330         }
1331       }
1332 
1333       // We can't combine several GEPs into one vector if they operate on
1334       // different types.
1335       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1336       for (unsigned j = 0; j < VL.size(); ++j) {
1337         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1338         if (Ty0 != CurTy) {
1339           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1340           BS.cancelScheduling(VL);
1341           newTreeEntry(VL, false);
1342           return;
1343         }
1344       }
1345 
1346       // We don't combine GEPs with non-constant indexes.
1347       for (unsigned j = 0; j < VL.size(); ++j) {
1348         auto Op = cast<Instruction>(VL[j])->getOperand(1);
1349         if (!isa<ConstantInt>(Op)) {
1350           DEBUG(
1351               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1352           BS.cancelScheduling(VL);
1353           newTreeEntry(VL, false);
1354           return;
1355         }
1356       }
1357 
1358       newTreeEntry(VL, true);
1359       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1360       for (unsigned i = 0, e = 2; i < e; ++i) {
1361         ValueList Operands;
1362         // Prepare the operand vector.
1363         for (unsigned j = 0; j < VL.size(); ++j)
1364           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1365 
1366         buildTree_rec(Operands, Depth + 1);
1367       }
1368       return;
1369     }
1370     case Instruction::Store: {
1371       // Check if the stores are consecutive or of we need to swizzle them.
1372       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1373         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1374           BS.cancelScheduling(VL);
1375           newTreeEntry(VL, false);
1376           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1377           return;
1378         }
1379 
1380       newTreeEntry(VL, true);
1381       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1382 
1383       ValueList Operands;
1384       for (unsigned j = 0; j < VL.size(); ++j)
1385         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1386 
1387       buildTree_rec(Operands, Depth + 1);
1388       return;
1389     }
1390     case Instruction::Call: {
1391       // Check if the calls are all to the same vectorizable intrinsic.
1392       CallInst *CI = cast<CallInst>(VL[0]);
1393       // Check if this is an Intrinsic call or something that can be
1394       // represented by an intrinsic call
1395       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1396       if (!isTriviallyVectorizable(ID)) {
1397         BS.cancelScheduling(VL);
1398         newTreeEntry(VL, false);
1399         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1400         return;
1401       }
1402       Function *Int = CI->getCalledFunction();
1403       Value *A1I = nullptr;
1404       if (hasVectorInstrinsicScalarOpd(ID, 1))
1405         A1I = CI->getArgOperand(1);
1406       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1407         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1408         if (!CI2 || CI2->getCalledFunction() != Int ||
1409             getIntrinsicIDForCall(CI2, TLI) != ID) {
1410           BS.cancelScheduling(VL);
1411           newTreeEntry(VL, false);
1412           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1413                        << "\n");
1414           return;
1415         }
1416         // ctlz,cttz and powi are special intrinsics whose second argument
1417         // should be same in order for them to be vectorized.
1418         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1419           Value *A1J = CI2->getArgOperand(1);
1420           if (A1I != A1J) {
1421             BS.cancelScheduling(VL);
1422             newTreeEntry(VL, false);
1423             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1424                          << " argument "<< A1I<<"!=" << A1J
1425                          << "\n");
1426             return;
1427           }
1428         }
1429       }
1430 
1431       newTreeEntry(VL, true);
1432       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1433         ValueList Operands;
1434         // Prepare the operand vector.
1435         for (unsigned j = 0; j < VL.size(); ++j) {
1436           CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1437           Operands.push_back(CI2->getArgOperand(i));
1438         }
1439         buildTree_rec(Operands, Depth + 1);
1440       }
1441       return;
1442     }
1443     case Instruction::ShuffleVector: {
1444       // If this is not an alternate sequence of opcode like add-sub
1445       // then do not vectorize this instruction.
1446       if (!isAltShuffle) {
1447         BS.cancelScheduling(VL);
1448         newTreeEntry(VL, false);
1449         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1450         return;
1451       }
1452       newTreeEntry(VL, true);
1453       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1454       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1455         ValueList Operands;
1456         // Prepare the operand vector.
1457         for (unsigned j = 0; j < VL.size(); ++j)
1458           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1459 
1460         buildTree_rec(Operands, Depth + 1);
1461       }
1462       return;
1463     }
1464     default:
1465       BS.cancelScheduling(VL);
1466       newTreeEntry(VL, false);
1467       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1468       return;
1469   }
1470 }
1471 
getEntryCost(TreeEntry * E)1472 int BoUpSLP::getEntryCost(TreeEntry *E) {
1473   ArrayRef<Value*> VL = E->Scalars;
1474 
1475   Type *ScalarTy = VL[0]->getType();
1476   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1477     ScalarTy = SI->getValueOperand()->getType();
1478   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1479 
1480   if (E->NeedToGather) {
1481     if (allConstant(VL))
1482       return 0;
1483     if (isSplat(VL)) {
1484       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1485     }
1486     return getGatherCost(E->Scalars);
1487   }
1488   unsigned Opcode = getSameOpcode(VL);
1489   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1490   Instruction *VL0 = cast<Instruction>(VL[0]);
1491   switch (Opcode) {
1492     case Instruction::PHI: {
1493       return 0;
1494     }
1495     case Instruction::ExtractElement: {
1496       if (CanReuseExtract(VL)) {
1497         int DeadCost = 0;
1498         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1499           ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1500           if (E->hasOneUse())
1501             // Take credit for instruction that will become dead.
1502             DeadCost +=
1503                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1504         }
1505         return -DeadCost;
1506       }
1507       return getGatherCost(VecTy);
1508     }
1509     case Instruction::ZExt:
1510     case Instruction::SExt:
1511     case Instruction::FPToUI:
1512     case Instruction::FPToSI:
1513     case Instruction::FPExt:
1514     case Instruction::PtrToInt:
1515     case Instruction::IntToPtr:
1516     case Instruction::SIToFP:
1517     case Instruction::UIToFP:
1518     case Instruction::Trunc:
1519     case Instruction::FPTrunc:
1520     case Instruction::BitCast: {
1521       Type *SrcTy = VL0->getOperand(0)->getType();
1522 
1523       // Calculate the cost of this instruction.
1524       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1525                                                          VL0->getType(), SrcTy);
1526 
1527       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1528       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1529       return VecCost - ScalarCost;
1530     }
1531     case Instruction::FCmp:
1532     case Instruction::ICmp:
1533     case Instruction::Select:
1534     case Instruction::Add:
1535     case Instruction::FAdd:
1536     case Instruction::Sub:
1537     case Instruction::FSub:
1538     case Instruction::Mul:
1539     case Instruction::FMul:
1540     case Instruction::UDiv:
1541     case Instruction::SDiv:
1542     case Instruction::FDiv:
1543     case Instruction::URem:
1544     case Instruction::SRem:
1545     case Instruction::FRem:
1546     case Instruction::Shl:
1547     case Instruction::LShr:
1548     case Instruction::AShr:
1549     case Instruction::And:
1550     case Instruction::Or:
1551     case Instruction::Xor: {
1552       // Calculate the cost of this instruction.
1553       int ScalarCost = 0;
1554       int VecCost = 0;
1555       if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1556           Opcode == Instruction::Select) {
1557         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1558         ScalarCost = VecTy->getNumElements() *
1559         TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1560         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1561       } else {
1562         // Certain instructions can be cheaper to vectorize if they have a
1563         // constant second vector operand.
1564         TargetTransformInfo::OperandValueKind Op1VK =
1565             TargetTransformInfo::OK_AnyValue;
1566         TargetTransformInfo::OperandValueKind Op2VK =
1567             TargetTransformInfo::OK_UniformConstantValue;
1568         TargetTransformInfo::OperandValueProperties Op1VP =
1569             TargetTransformInfo::OP_None;
1570         TargetTransformInfo::OperandValueProperties Op2VP =
1571             TargetTransformInfo::OP_None;
1572 
1573         // If all operands are exactly the same ConstantInt then set the
1574         // operand kind to OK_UniformConstantValue.
1575         // If instead not all operands are constants, then set the operand kind
1576         // to OK_AnyValue. If all operands are constants but not the same,
1577         // then set the operand kind to OK_NonUniformConstantValue.
1578         ConstantInt *CInt = nullptr;
1579         for (unsigned i = 0; i < VL.size(); ++i) {
1580           const Instruction *I = cast<Instruction>(VL[i]);
1581           if (!isa<ConstantInt>(I->getOperand(1))) {
1582             Op2VK = TargetTransformInfo::OK_AnyValue;
1583             break;
1584           }
1585           if (i == 0) {
1586             CInt = cast<ConstantInt>(I->getOperand(1));
1587             continue;
1588           }
1589           if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1590               CInt != cast<ConstantInt>(I->getOperand(1)))
1591             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1592         }
1593         // FIXME: Currently cost of model modification for division by
1594         // power of 2 is handled only for X86. Add support for other targets.
1595         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1596             CInt->getValue().isPowerOf2())
1597           Op2VP = TargetTransformInfo::OP_PowerOf2;
1598 
1599         ScalarCost = VecTy->getNumElements() *
1600                      TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1601                                                  Op1VP, Op2VP);
1602         VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1603                                               Op1VP, Op2VP);
1604       }
1605       return VecCost - ScalarCost;
1606     }
1607     case Instruction::GetElementPtr: {
1608       TargetTransformInfo::OperandValueKind Op1VK =
1609           TargetTransformInfo::OK_AnyValue;
1610       TargetTransformInfo::OperandValueKind Op2VK =
1611           TargetTransformInfo::OK_UniformConstantValue;
1612 
1613       int ScalarCost =
1614           VecTy->getNumElements() *
1615           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1616       int VecCost =
1617           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1618 
1619       return VecCost - ScalarCost;
1620     }
1621     case Instruction::Load: {
1622       // Cost of wide load - cost of scalar loads.
1623       int ScalarLdCost = VecTy->getNumElements() *
1624       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1625       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1626       return VecLdCost - ScalarLdCost;
1627     }
1628     case Instruction::Store: {
1629       // We know that we can merge the stores. Calculate the cost.
1630       int ScalarStCost = VecTy->getNumElements() *
1631       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1632       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1633       return VecStCost - ScalarStCost;
1634     }
1635     case Instruction::Call: {
1636       CallInst *CI = cast<CallInst>(VL0);
1637       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1638 
1639       // Calculate the cost of the scalar and vector calls.
1640       SmallVector<Type*, 4> ScalarTys, VecTys;
1641       for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1642         ScalarTys.push_back(CI->getArgOperand(op)->getType());
1643         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1644                                          VecTy->getNumElements()));
1645       }
1646 
1647       int ScalarCallCost = VecTy->getNumElements() *
1648           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1649 
1650       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1651 
1652       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1653             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
1654             << " for " << *CI << "\n");
1655 
1656       return VecCallCost - ScalarCallCost;
1657     }
1658     case Instruction::ShuffleVector: {
1659       TargetTransformInfo::OperandValueKind Op1VK =
1660           TargetTransformInfo::OK_AnyValue;
1661       TargetTransformInfo::OperandValueKind Op2VK =
1662           TargetTransformInfo::OK_AnyValue;
1663       int ScalarCost = 0;
1664       int VecCost = 0;
1665       for (unsigned i = 0; i < VL.size(); ++i) {
1666         Instruction *I = cast<Instruction>(VL[i]);
1667         if (!I)
1668           break;
1669         ScalarCost +=
1670             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1671       }
1672       // VecCost is equal to sum of the cost of creating 2 vectors
1673       // and the cost of creating shuffle.
1674       Instruction *I0 = cast<Instruction>(VL[0]);
1675       VecCost =
1676           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1677       Instruction *I1 = cast<Instruction>(VL[1]);
1678       VecCost +=
1679           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1680       VecCost +=
1681           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1682       return VecCost - ScalarCost;
1683     }
1684     default:
1685       llvm_unreachable("Unknown instruction");
1686   }
1687 }
1688 
isFullyVectorizableTinyTree()1689 bool BoUpSLP::isFullyVectorizableTinyTree() {
1690   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1691         VectorizableTree.size() << " is fully vectorizable .\n");
1692 
1693   // We only handle trees of height 2.
1694   if (VectorizableTree.size() != 2)
1695     return false;
1696 
1697   // Handle splat stores.
1698   if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
1699     return true;
1700 
1701   // Gathering cost would be too much for tiny trees.
1702   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1703     return false;
1704 
1705   return true;
1706 }
1707 
getSpillCost()1708 int BoUpSLP::getSpillCost() {
1709   // Walk from the bottom of the tree to the top, tracking which values are
1710   // live. When we see a call instruction that is not part of our tree,
1711   // query TTI to see if there is a cost to keeping values live over it
1712   // (for example, if spills and fills are required).
1713   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1714   int Cost = 0;
1715 
1716   SmallPtrSet<Instruction*, 4> LiveValues;
1717   Instruction *PrevInst = nullptr;
1718 
1719   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1720     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1721     if (!Inst)
1722       continue;
1723 
1724     if (!PrevInst) {
1725       PrevInst = Inst;
1726       continue;
1727     }
1728 
1729     DEBUG(
1730       dbgs() << "SLP: #LV: " << LiveValues.size();
1731       for (auto *X : LiveValues)
1732         dbgs() << " " << X->getName();
1733       dbgs() << ", Looking at ";
1734       Inst->dump();
1735       );
1736 
1737     // Update LiveValues.
1738     LiveValues.erase(PrevInst);
1739     for (auto &J : PrevInst->operands()) {
1740       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1741         LiveValues.insert(cast<Instruction>(&*J));
1742     }
1743 
1744     // Now find the sequence of instructions between PrevInst and Inst.
1745     BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1746     --PrevInstIt;
1747     while (InstIt != PrevInstIt) {
1748       if (PrevInstIt == PrevInst->getParent()->rend()) {
1749         PrevInstIt = Inst->getParent()->rbegin();
1750         continue;
1751       }
1752 
1753       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1754         SmallVector<Type*, 4> V;
1755         for (auto *II : LiveValues)
1756           V.push_back(VectorType::get(II->getType(), BundleWidth));
1757         Cost += TTI->getCostOfKeepingLiveOverCall(V);
1758       }
1759 
1760       ++PrevInstIt;
1761     }
1762 
1763     PrevInst = Inst;
1764   }
1765 
1766   DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1767   return Cost;
1768 }
1769 
getTreeCost()1770 int BoUpSLP::getTreeCost() {
1771   int Cost = 0;
1772   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1773         VectorizableTree.size() << ".\n");
1774 
1775   // We only vectorize tiny trees if it is fully vectorizable.
1776   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1777     if (!VectorizableTree.size()) {
1778       assert(!ExternalUses.size() && "We should not have any external users");
1779     }
1780     return INT_MAX;
1781   }
1782 
1783   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1784 
1785   for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1786     int C = getEntryCost(&VectorizableTree[i]);
1787     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1788           << *VectorizableTree[i].Scalars[0] << " .\n");
1789     Cost += C;
1790   }
1791 
1792   SmallSet<Value *, 16> ExtractCostCalculated;
1793   int ExtractCost = 0;
1794   for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1795        I != E; ++I) {
1796     // We only add extract cost once for the same scalar.
1797     if (!ExtractCostCalculated.insert(I->Scalar).second)
1798       continue;
1799 
1800     // Uses by ephemeral values are free (because the ephemeral value will be
1801     // removed prior to code generation, and so the extraction will be
1802     // removed as well).
1803     if (EphValues.count(I->User))
1804       continue;
1805 
1806     VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1807     ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1808                                            I->Lane);
1809   }
1810 
1811   Cost += getSpillCost();
1812 
1813   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
1814   return  Cost + ExtractCost;
1815 }
1816 
getGatherCost(Type * Ty)1817 int BoUpSLP::getGatherCost(Type *Ty) {
1818   int Cost = 0;
1819   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1820     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1821   return Cost;
1822 }
1823 
getGatherCost(ArrayRef<Value * > VL)1824 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1825   // Find the type of the operands in VL.
1826   Type *ScalarTy = VL[0]->getType();
1827   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1828     ScalarTy = SI->getValueOperand()->getType();
1829   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1830   // Find the cost of inserting/extracting values from the vector.
1831   return getGatherCost(VecTy);
1832 }
1833 
getPointerOperand(Value * I)1834 Value *BoUpSLP::getPointerOperand(Value *I) {
1835   if (LoadInst *LI = dyn_cast<LoadInst>(I))
1836     return LI->getPointerOperand();
1837   if (StoreInst *SI = dyn_cast<StoreInst>(I))
1838     return SI->getPointerOperand();
1839   return nullptr;
1840 }
1841 
getAddressSpaceOperand(Value * I)1842 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1843   if (LoadInst *L = dyn_cast<LoadInst>(I))
1844     return L->getPointerAddressSpace();
1845   if (StoreInst *S = dyn_cast<StoreInst>(I))
1846     return S->getPointerAddressSpace();
1847   return -1;
1848 }
1849 
isConsecutiveAccess(Value * A,Value * B)1850 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
1851   Value *PtrA = getPointerOperand(A);
1852   Value *PtrB = getPointerOperand(B);
1853   unsigned ASA = getAddressSpaceOperand(A);
1854   unsigned ASB = getAddressSpaceOperand(B);
1855 
1856   // Check that the address spaces match and that the pointers are valid.
1857   if (!PtrA || !PtrB || (ASA != ASB))
1858     return false;
1859 
1860   // Make sure that A and B are different pointers of the same type.
1861   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1862     return false;
1863 
1864   unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
1865   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1866   APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
1867 
1868   APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1869   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
1870   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
1871 
1872   APInt OffsetDelta = OffsetB - OffsetA;
1873 
1874   // Check if they are based on the same pointer. That makes the offsets
1875   // sufficient.
1876   if (PtrA == PtrB)
1877     return OffsetDelta == Size;
1878 
1879   // Compute the necessary base pointer delta to have the necessary final delta
1880   // equal to the size.
1881   APInt BaseDelta = Size - OffsetDelta;
1882 
1883   // Otherwise compute the distance with SCEV between the base pointers.
1884   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1885   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1886   const SCEV *C = SE->getConstant(BaseDelta);
1887   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1888   return X == PtrSCEVB;
1889 }
1890 
setInsertPointAfterBundle(ArrayRef<Value * > VL)1891 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
1892   Instruction *VL0 = cast<Instruction>(VL[0]);
1893   BasicBlock::iterator NextInst = VL0;
1894   ++NextInst;
1895   Builder.SetInsertPoint(VL0->getParent(), NextInst);
1896   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
1897 }
1898 
Gather(ArrayRef<Value * > VL,VectorType * Ty)1899 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
1900   Value *Vec = UndefValue::get(Ty);
1901   // Generate the 'InsertElement' instruction.
1902   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
1903     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
1904     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
1905       GatherSeq.insert(Insrt);
1906       CSEBlocks.insert(Insrt->getParent());
1907 
1908       // Add to our 'need-to-extract' list.
1909       if (ScalarToTreeEntry.count(VL[i])) {
1910         int Idx = ScalarToTreeEntry[VL[i]];
1911         TreeEntry *E = &VectorizableTree[Idx];
1912         // Find which lane we need to extract.
1913         int FoundLane = -1;
1914         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
1915           // Is this the lane of the scalar that we are looking for ?
1916           if (E->Scalars[Lane] == VL[i]) {
1917             FoundLane = Lane;
1918             break;
1919           }
1920         }
1921         assert(FoundLane >= 0 && "Could not find the correct lane");
1922         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
1923       }
1924     }
1925   }
1926 
1927   return Vec;
1928 }
1929 
alreadyVectorized(ArrayRef<Value * > VL) const1930 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
1931   SmallDenseMap<Value*, int>::const_iterator Entry
1932     = ScalarToTreeEntry.find(VL[0]);
1933   if (Entry != ScalarToTreeEntry.end()) {
1934     int Idx = Entry->second;
1935     const TreeEntry *En = &VectorizableTree[Idx];
1936     if (En->isSame(VL) && En->VectorizedValue)
1937       return En->VectorizedValue;
1938   }
1939   return nullptr;
1940 }
1941 
vectorizeTree(ArrayRef<Value * > VL)1942 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
1943   if (ScalarToTreeEntry.count(VL[0])) {
1944     int Idx = ScalarToTreeEntry[VL[0]];
1945     TreeEntry *E = &VectorizableTree[Idx];
1946     if (E->isSame(VL))
1947       return vectorizeTree(E);
1948   }
1949 
1950   Type *ScalarTy = VL[0]->getType();
1951   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1952     ScalarTy = SI->getValueOperand()->getType();
1953   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1954 
1955   return Gather(VL, VecTy);
1956 }
1957 
vectorizeTree(TreeEntry * E)1958 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1959   IRBuilder<>::InsertPointGuard Guard(Builder);
1960 
1961   if (E->VectorizedValue) {
1962     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
1963     return E->VectorizedValue;
1964   }
1965 
1966   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
1967   Type *ScalarTy = VL0->getType();
1968   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
1969     ScalarTy = SI->getValueOperand()->getType();
1970   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
1971 
1972   if (E->NeedToGather) {
1973     setInsertPointAfterBundle(E->Scalars);
1974     return Gather(E->Scalars, VecTy);
1975   }
1976 
1977   unsigned Opcode = getSameOpcode(E->Scalars);
1978 
1979   switch (Opcode) {
1980     case Instruction::PHI: {
1981       PHINode *PH = dyn_cast<PHINode>(VL0);
1982       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
1983       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
1984       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
1985       E->VectorizedValue = NewPhi;
1986 
1987       // PHINodes may have multiple entries from the same block. We want to
1988       // visit every block once.
1989       SmallSet<BasicBlock*, 4> VisitedBBs;
1990 
1991       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1992         ValueList Operands;
1993         BasicBlock *IBB = PH->getIncomingBlock(i);
1994 
1995         if (!VisitedBBs.insert(IBB).second) {
1996           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
1997           continue;
1998         }
1999 
2000         // Prepare the operand vector.
2001         for (unsigned j = 0; j < E->Scalars.size(); ++j)
2002           Operands.push_back(cast<PHINode>(E->Scalars[j])->
2003                              getIncomingValueForBlock(IBB));
2004 
2005         Builder.SetInsertPoint(IBB->getTerminator());
2006         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2007         Value *Vec = vectorizeTree(Operands);
2008         NewPhi->addIncoming(Vec, IBB);
2009       }
2010 
2011       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2012              "Invalid number of incoming values");
2013       return NewPhi;
2014     }
2015 
2016     case Instruction::ExtractElement: {
2017       if (CanReuseExtract(E->Scalars)) {
2018         Value *V = VL0->getOperand(0);
2019         E->VectorizedValue = V;
2020         return V;
2021       }
2022       return Gather(E->Scalars, VecTy);
2023     }
2024     case Instruction::ZExt:
2025     case Instruction::SExt:
2026     case Instruction::FPToUI:
2027     case Instruction::FPToSI:
2028     case Instruction::FPExt:
2029     case Instruction::PtrToInt:
2030     case Instruction::IntToPtr:
2031     case Instruction::SIToFP:
2032     case Instruction::UIToFP:
2033     case Instruction::Trunc:
2034     case Instruction::FPTrunc:
2035     case Instruction::BitCast: {
2036       ValueList INVL;
2037       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2038         INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2039 
2040       setInsertPointAfterBundle(E->Scalars);
2041 
2042       Value *InVec = vectorizeTree(INVL);
2043 
2044       if (Value *V = alreadyVectorized(E->Scalars))
2045         return V;
2046 
2047       CastInst *CI = dyn_cast<CastInst>(VL0);
2048       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2049       E->VectorizedValue = V;
2050       ++NumVectorInstructions;
2051       return V;
2052     }
2053     case Instruction::FCmp:
2054     case Instruction::ICmp: {
2055       ValueList LHSV, RHSV;
2056       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2057         LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2058         RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2059       }
2060 
2061       setInsertPointAfterBundle(E->Scalars);
2062 
2063       Value *L = vectorizeTree(LHSV);
2064       Value *R = vectorizeTree(RHSV);
2065 
2066       if (Value *V = alreadyVectorized(E->Scalars))
2067         return V;
2068 
2069       CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
2070       Value *V;
2071       if (Opcode == Instruction::FCmp)
2072         V = Builder.CreateFCmp(P0, L, R);
2073       else
2074         V = Builder.CreateICmp(P0, L, R);
2075 
2076       E->VectorizedValue = V;
2077       ++NumVectorInstructions;
2078       return V;
2079     }
2080     case Instruction::Select: {
2081       ValueList TrueVec, FalseVec, CondVec;
2082       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2083         CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2084         TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2085         FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
2086       }
2087 
2088       setInsertPointAfterBundle(E->Scalars);
2089 
2090       Value *Cond = vectorizeTree(CondVec);
2091       Value *True = vectorizeTree(TrueVec);
2092       Value *False = vectorizeTree(FalseVec);
2093 
2094       if (Value *V = alreadyVectorized(E->Scalars))
2095         return V;
2096 
2097       Value *V = Builder.CreateSelect(Cond, True, False);
2098       E->VectorizedValue = V;
2099       ++NumVectorInstructions;
2100       return V;
2101     }
2102     case Instruction::Add:
2103     case Instruction::FAdd:
2104     case Instruction::Sub:
2105     case Instruction::FSub:
2106     case Instruction::Mul:
2107     case Instruction::FMul:
2108     case Instruction::UDiv:
2109     case Instruction::SDiv:
2110     case Instruction::FDiv:
2111     case Instruction::URem:
2112     case Instruction::SRem:
2113     case Instruction::FRem:
2114     case Instruction::Shl:
2115     case Instruction::LShr:
2116     case Instruction::AShr:
2117     case Instruction::And:
2118     case Instruction::Or:
2119     case Instruction::Xor: {
2120       ValueList LHSVL, RHSVL;
2121       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2122         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2123       else
2124         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2125           LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2126           RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2127         }
2128 
2129       setInsertPointAfterBundle(E->Scalars);
2130 
2131       Value *LHS = vectorizeTree(LHSVL);
2132       Value *RHS = vectorizeTree(RHSVL);
2133 
2134       if (LHS == RHS && isa<Instruction>(LHS)) {
2135         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2136       }
2137 
2138       if (Value *V = alreadyVectorized(E->Scalars))
2139         return V;
2140 
2141       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2142       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2143       E->VectorizedValue = V;
2144       propagateIRFlags(E->VectorizedValue, E->Scalars);
2145       ++NumVectorInstructions;
2146 
2147       if (Instruction *I = dyn_cast<Instruction>(V))
2148         return propagateMetadata(I, E->Scalars);
2149 
2150       return V;
2151     }
2152     case Instruction::Load: {
2153       // Loads are inserted at the head of the tree because we don't want to
2154       // sink them all the way down past store instructions.
2155       setInsertPointAfterBundle(E->Scalars);
2156 
2157       LoadInst *LI = cast<LoadInst>(VL0);
2158       Type *ScalarLoadTy = LI->getType();
2159       unsigned AS = LI->getPointerAddressSpace();
2160 
2161       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2162                                             VecTy->getPointerTo(AS));
2163 
2164       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2165       // ExternalUses list to make sure that an extract will be generated in the
2166       // future.
2167       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2168         ExternalUses.push_back(
2169             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2170 
2171       unsigned Alignment = LI->getAlignment();
2172       LI = Builder.CreateLoad(VecPtr);
2173       if (!Alignment)
2174         Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2175       LI->setAlignment(Alignment);
2176       E->VectorizedValue = LI;
2177       ++NumVectorInstructions;
2178       return propagateMetadata(LI, E->Scalars);
2179     }
2180     case Instruction::Store: {
2181       StoreInst *SI = cast<StoreInst>(VL0);
2182       unsigned Alignment = SI->getAlignment();
2183       unsigned AS = SI->getPointerAddressSpace();
2184 
2185       ValueList ValueOp;
2186       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2187         ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
2188 
2189       setInsertPointAfterBundle(E->Scalars);
2190 
2191       Value *VecValue = vectorizeTree(ValueOp);
2192       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2193                                             VecTy->getPointerTo(AS));
2194       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2195 
2196       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2197       // ExternalUses list to make sure that an extract will be generated in the
2198       // future.
2199       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2200         ExternalUses.push_back(
2201             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2202 
2203       if (!Alignment)
2204         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2205       S->setAlignment(Alignment);
2206       E->VectorizedValue = S;
2207       ++NumVectorInstructions;
2208       return propagateMetadata(S, E->Scalars);
2209     }
2210     case Instruction::GetElementPtr: {
2211       setInsertPointAfterBundle(E->Scalars);
2212 
2213       ValueList Op0VL;
2214       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2215         Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
2216 
2217       Value *Op0 = vectorizeTree(Op0VL);
2218 
2219       std::vector<Value *> OpVecs;
2220       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2221            ++j) {
2222         ValueList OpVL;
2223         for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2224           OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
2225 
2226         Value *OpVec = vectorizeTree(OpVL);
2227         OpVecs.push_back(OpVec);
2228       }
2229 
2230       Value *V = Builder.CreateGEP(Op0, OpVecs);
2231       E->VectorizedValue = V;
2232       ++NumVectorInstructions;
2233 
2234       if (Instruction *I = dyn_cast<Instruction>(V))
2235         return propagateMetadata(I, E->Scalars);
2236 
2237       return V;
2238     }
2239     case Instruction::Call: {
2240       CallInst *CI = cast<CallInst>(VL0);
2241       setInsertPointAfterBundle(E->Scalars);
2242       Function *FI;
2243       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
2244       Value *ScalarArg = nullptr;
2245       if (CI && (FI = CI->getCalledFunction())) {
2246         IID = (Intrinsic::ID) FI->getIntrinsicID();
2247       }
2248       std::vector<Value *> OpVecs;
2249       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2250         ValueList OpVL;
2251         // ctlz,cttz and powi are special intrinsics whose second argument is
2252         // a scalar. This argument should not be vectorized.
2253         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2254           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2255           ScalarArg = CEI->getArgOperand(j);
2256           OpVecs.push_back(CEI->getArgOperand(j));
2257           continue;
2258         }
2259         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2260           CallInst *CEI = cast<CallInst>(E->Scalars[i]);
2261           OpVL.push_back(CEI->getArgOperand(j));
2262         }
2263 
2264         Value *OpVec = vectorizeTree(OpVL);
2265         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2266         OpVecs.push_back(OpVec);
2267       }
2268 
2269       Module *M = F->getParent();
2270       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2271       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2272       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2273       Value *V = Builder.CreateCall(CF, OpVecs);
2274 
2275       // The scalar argument uses an in-tree scalar so we add the new vectorized
2276       // call to ExternalUses list to make sure that an extract will be
2277       // generated in the future.
2278       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2279         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2280 
2281       E->VectorizedValue = V;
2282       ++NumVectorInstructions;
2283       return V;
2284     }
2285     case Instruction::ShuffleVector: {
2286       ValueList LHSVL, RHSVL;
2287       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2288         LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2289         RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2290       }
2291       setInsertPointAfterBundle(E->Scalars);
2292 
2293       Value *LHS = vectorizeTree(LHSVL);
2294       Value *RHS = vectorizeTree(RHSVL);
2295 
2296       if (Value *V = alreadyVectorized(E->Scalars))
2297         return V;
2298 
2299       // Create a vector of LHS op1 RHS
2300       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2301       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2302 
2303       // Create a vector of LHS op2 RHS
2304       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2305       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2306       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2307 
2308       // Create shuffle to take alternate operations from the vector.
2309       // Also, gather up odd and even scalar ops to propagate IR flags to
2310       // each vector operation.
2311       ValueList OddScalars, EvenScalars;
2312       unsigned e = E->Scalars.size();
2313       SmallVector<Constant *, 8> Mask(e);
2314       for (unsigned i = 0; i < e; ++i) {
2315         if (i & 1) {
2316           Mask[i] = Builder.getInt32(e + i);
2317           OddScalars.push_back(E->Scalars[i]);
2318         } else {
2319           Mask[i] = Builder.getInt32(i);
2320           EvenScalars.push_back(E->Scalars[i]);
2321         }
2322       }
2323 
2324       Value *ShuffleMask = ConstantVector::get(Mask);
2325       propagateIRFlags(V0, EvenScalars);
2326       propagateIRFlags(V1, OddScalars);
2327 
2328       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2329       E->VectorizedValue = V;
2330       ++NumVectorInstructions;
2331       if (Instruction *I = dyn_cast<Instruction>(V))
2332         return propagateMetadata(I, E->Scalars);
2333 
2334       return V;
2335     }
2336     default:
2337     llvm_unreachable("unknown inst");
2338   }
2339   return nullptr;
2340 }
2341 
vectorizeTree()2342 Value *BoUpSLP::vectorizeTree() {
2343 
2344   // All blocks must be scheduled before any instructions are inserted.
2345   for (auto &BSIter : BlocksSchedules) {
2346     scheduleBlock(BSIter.second.get());
2347   }
2348 
2349   Builder.SetInsertPoint(F->getEntryBlock().begin());
2350   vectorizeTree(&VectorizableTree[0]);
2351 
2352   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2353 
2354   // Extract all of the elements with the external uses.
2355   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2356        it != e; ++it) {
2357     Value *Scalar = it->Scalar;
2358     llvm::User *User = it->User;
2359 
2360     // Skip users that we already RAUW. This happens when one instruction
2361     // has multiple uses of the same value.
2362     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2363         Scalar->user_end())
2364       continue;
2365     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2366 
2367     int Idx = ScalarToTreeEntry[Scalar];
2368     TreeEntry *E = &VectorizableTree[Idx];
2369     assert(!E->NeedToGather && "Extracting from a gather list");
2370 
2371     Value *Vec = E->VectorizedValue;
2372     assert(Vec && "Can't find vectorizable value");
2373 
2374     Value *Lane = Builder.getInt32(it->Lane);
2375     // Generate extracts for out-of-tree users.
2376     // Find the insertion point for the extractelement lane.
2377     if (isa<Instruction>(Vec)){
2378       if (PHINode *PH = dyn_cast<PHINode>(User)) {
2379         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2380           if (PH->getIncomingValue(i) == Scalar) {
2381             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2382             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2383             CSEBlocks.insert(PH->getIncomingBlock(i));
2384             PH->setOperand(i, Ex);
2385           }
2386         }
2387       } else {
2388         Builder.SetInsertPoint(cast<Instruction>(User));
2389         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2390         CSEBlocks.insert(cast<Instruction>(User)->getParent());
2391         User->replaceUsesOfWith(Scalar, Ex);
2392      }
2393     } else {
2394       Builder.SetInsertPoint(F->getEntryBlock().begin());
2395       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2396       CSEBlocks.insert(&F->getEntryBlock());
2397       User->replaceUsesOfWith(Scalar, Ex);
2398     }
2399 
2400     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2401   }
2402 
2403   // For each vectorized value:
2404   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2405     TreeEntry *Entry = &VectorizableTree[EIdx];
2406 
2407     // For each lane:
2408     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2409       Value *Scalar = Entry->Scalars[Lane];
2410       // No need to handle users of gathered values.
2411       if (Entry->NeedToGather)
2412         continue;
2413 
2414       assert(Entry->VectorizedValue && "Can't find vectorizable value");
2415 
2416       Type *Ty = Scalar->getType();
2417       if (!Ty->isVoidTy()) {
2418 #ifndef NDEBUG
2419         for (User *U : Scalar->users()) {
2420           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2421 
2422           assert((ScalarToTreeEntry.count(U) ||
2423                   // It is legal to replace users in the ignorelist by undef.
2424                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2425                    UserIgnoreList.end())) &&
2426                  "Replacing out-of-tree value with undef");
2427         }
2428 #endif
2429         Value *Undef = UndefValue::get(Ty);
2430         Scalar->replaceAllUsesWith(Undef);
2431       }
2432       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2433       eraseInstruction(cast<Instruction>(Scalar));
2434     }
2435   }
2436 
2437   Builder.ClearInsertionPoint();
2438 
2439   return VectorizableTree[0].VectorizedValue;
2440 }
2441 
optimizeGatherSequence()2442 void BoUpSLP::optimizeGatherSequence() {
2443   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2444         << " gather sequences instructions.\n");
2445   // LICM InsertElementInst sequences.
2446   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2447        e = GatherSeq.end(); it != e; ++it) {
2448     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2449 
2450     if (!Insert)
2451       continue;
2452 
2453     // Check if this block is inside a loop.
2454     Loop *L = LI->getLoopFor(Insert->getParent());
2455     if (!L)
2456       continue;
2457 
2458     // Check if it has a preheader.
2459     BasicBlock *PreHeader = L->getLoopPreheader();
2460     if (!PreHeader)
2461       continue;
2462 
2463     // If the vector or the element that we insert into it are
2464     // instructions that are defined in this basic block then we can't
2465     // hoist this instruction.
2466     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2467     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2468     if (CurrVec && L->contains(CurrVec))
2469       continue;
2470     if (NewElem && L->contains(NewElem))
2471       continue;
2472 
2473     // We can hoist this instruction. Move it to the pre-header.
2474     Insert->moveBefore(PreHeader->getTerminator());
2475   }
2476 
2477   // Make a list of all reachable blocks in our CSE queue.
2478   SmallVector<const DomTreeNode *, 8> CSEWorkList;
2479   CSEWorkList.reserve(CSEBlocks.size());
2480   for (BasicBlock *BB : CSEBlocks)
2481     if (DomTreeNode *N = DT->getNode(BB)) {
2482       assert(DT->isReachableFromEntry(N));
2483       CSEWorkList.push_back(N);
2484     }
2485 
2486   // Sort blocks by domination. This ensures we visit a block after all blocks
2487   // dominating it are visited.
2488   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2489                    [this](const DomTreeNode *A, const DomTreeNode *B) {
2490     return DT->properlyDominates(A, B);
2491   });
2492 
2493   // Perform O(N^2) search over the gather sequences and merge identical
2494   // instructions. TODO: We can further optimize this scan if we split the
2495   // instructions into different buckets based on the insert lane.
2496   SmallVector<Instruction *, 16> Visited;
2497   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2498     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2499            "Worklist not sorted properly!");
2500     BasicBlock *BB = (*I)->getBlock();
2501     // For all instructions in blocks containing gather sequences:
2502     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2503       Instruction *In = it++;
2504       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2505         continue;
2506 
2507       // Check if we can replace this instruction with any of the
2508       // visited instructions.
2509       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2510                                                     ve = Visited.end();
2511            v != ve; ++v) {
2512         if (In->isIdenticalTo(*v) &&
2513             DT->dominates((*v)->getParent(), In->getParent())) {
2514           In->replaceAllUsesWith(*v);
2515           eraseInstruction(In);
2516           In = nullptr;
2517           break;
2518         }
2519       }
2520       if (In) {
2521         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2522         Visited.push_back(In);
2523       }
2524     }
2525   }
2526   CSEBlocks.clear();
2527   GatherSeq.clear();
2528 }
2529 
2530 // Groups the instructions to a bundle (which is then a single scheduling entity)
2531 // and schedules instructions until the bundle gets ready.
tryScheduleBundle(ArrayRef<Value * > VL,BoUpSLP * SLP)2532 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2533                                                  BoUpSLP *SLP) {
2534   if (isa<PHINode>(VL[0]))
2535     return true;
2536 
2537   // Initialize the instruction bundle.
2538   Instruction *OldScheduleEnd = ScheduleEnd;
2539   ScheduleData *PrevInBundle = nullptr;
2540   ScheduleData *Bundle = nullptr;
2541   bool ReSchedule = false;
2542   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
2543   for (Value *V : VL) {
2544     extendSchedulingRegion(V);
2545     ScheduleData *BundleMember = getScheduleData(V);
2546     assert(BundleMember &&
2547            "no ScheduleData for bundle member (maybe not in same basic block)");
2548     if (BundleMember->IsScheduled) {
2549       // A bundle member was scheduled as single instruction before and now
2550       // needs to be scheduled as part of the bundle. We just get rid of the
2551       // existing schedule.
2552       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
2553                    << " was already scheduled\n");
2554       ReSchedule = true;
2555     }
2556     assert(BundleMember->isSchedulingEntity() &&
2557            "bundle member already part of other bundle");
2558     if (PrevInBundle) {
2559       PrevInBundle->NextInBundle = BundleMember;
2560     } else {
2561       Bundle = BundleMember;
2562     }
2563     BundleMember->UnscheduledDepsInBundle = 0;
2564     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2565 
2566     // Group the instructions to a bundle.
2567     BundleMember->FirstInBundle = Bundle;
2568     PrevInBundle = BundleMember;
2569   }
2570   if (ScheduleEnd != OldScheduleEnd) {
2571     // The scheduling region got new instructions at the lower end (or it is a
2572     // new region for the first bundle). This makes it necessary to
2573     // recalculate all dependencies.
2574     // It is seldom that this needs to be done a second time after adding the
2575     // initial bundle to the region.
2576     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2577       ScheduleData *SD = getScheduleData(I);
2578       SD->clearDependencies();
2579     }
2580     ReSchedule = true;
2581   }
2582   if (ReSchedule) {
2583     resetSchedule();
2584     initialFillReadyList(ReadyInsts);
2585   }
2586 
2587   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2588                << BB->getName() << "\n");
2589 
2590   calculateDependencies(Bundle, true, SLP);
2591 
2592   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2593   // means that there are no cyclic dependencies and we can schedule it.
2594   // Note that's important that we don't "schedule" the bundle yet (see
2595   // cancelScheduling).
2596   while (!Bundle->isReady() && !ReadyInsts.empty()) {
2597 
2598     ScheduleData *pickedSD = ReadyInsts.back();
2599     ReadyInsts.pop_back();
2600 
2601     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2602       schedule(pickedSD, ReadyInsts);
2603     }
2604   }
2605   return Bundle->isReady();
2606 }
2607 
cancelScheduling(ArrayRef<Value * > VL)2608 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2609   if (isa<PHINode>(VL[0]))
2610     return;
2611 
2612   ScheduleData *Bundle = getScheduleData(VL[0]);
2613   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
2614   assert(!Bundle->IsScheduled &&
2615          "Can't cancel bundle which is already scheduled");
2616   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2617          "tried to unbundle something which is not a bundle");
2618 
2619   // Un-bundle: make single instructions out of the bundle.
2620   ScheduleData *BundleMember = Bundle;
2621   while (BundleMember) {
2622     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2623     BundleMember->FirstInBundle = BundleMember;
2624     ScheduleData *Next = BundleMember->NextInBundle;
2625     BundleMember->NextInBundle = nullptr;
2626     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2627     if (BundleMember->UnscheduledDepsInBundle == 0) {
2628       ReadyInsts.insert(BundleMember);
2629     }
2630     BundleMember = Next;
2631   }
2632 }
2633 
extendSchedulingRegion(Value * V)2634 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2635   if (getScheduleData(V))
2636     return;
2637   Instruction *I = dyn_cast<Instruction>(V);
2638   assert(I && "bundle member must be an instruction");
2639   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2640   if (!ScheduleStart) {
2641     // It's the first instruction in the new region.
2642     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2643     ScheduleStart = I;
2644     ScheduleEnd = I->getNextNode();
2645     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2646     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
2647     return;
2648   }
2649   // Search up and down at the same time, because we don't know if the new
2650   // instruction is above or below the existing scheduling region.
2651   BasicBlock::reverse_iterator UpIter(ScheduleStart);
2652   BasicBlock::reverse_iterator UpperEnd = BB->rend();
2653   BasicBlock::iterator DownIter(ScheduleEnd);
2654   BasicBlock::iterator LowerEnd = BB->end();
2655   for (;;) {
2656     if (UpIter != UpperEnd) {
2657       if (&*UpIter == I) {
2658         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2659         ScheduleStart = I;
2660         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
2661         return;
2662       }
2663       UpIter++;
2664     }
2665     if (DownIter != LowerEnd) {
2666       if (&*DownIter == I) {
2667         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2668                          nullptr);
2669         ScheduleEnd = I->getNextNode();
2670         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2671         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
2672         return;
2673       }
2674       DownIter++;
2675     }
2676     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2677            "instruction not found in block");
2678   }
2679 }
2680 
initScheduleData(Instruction * FromI,Instruction * ToI,ScheduleData * PrevLoadStore,ScheduleData * NextLoadStore)2681 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2682                                                 Instruction *ToI,
2683                                                 ScheduleData *PrevLoadStore,
2684                                                 ScheduleData *NextLoadStore) {
2685   ScheduleData *CurrentLoadStore = PrevLoadStore;
2686   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2687     ScheduleData *SD = ScheduleDataMap[I];
2688     if (!SD) {
2689       // Allocate a new ScheduleData for the instruction.
2690       if (ChunkPos >= ChunkSize) {
2691         ScheduleDataChunks.push_back(
2692             llvm::make_unique<ScheduleData[]>(ChunkSize));
2693         ChunkPos = 0;
2694       }
2695       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2696       ScheduleDataMap[I] = SD;
2697       SD->Inst = I;
2698     }
2699     assert(!isInSchedulingRegion(SD) &&
2700            "new ScheduleData already in scheduling region");
2701     SD->init(SchedulingRegionID);
2702 
2703     if (I->mayReadOrWriteMemory()) {
2704       // Update the linked list of memory accessing instructions.
2705       if (CurrentLoadStore) {
2706         CurrentLoadStore->NextLoadStore = SD;
2707       } else {
2708         FirstLoadStoreInRegion = SD;
2709       }
2710       CurrentLoadStore = SD;
2711     }
2712   }
2713   if (NextLoadStore) {
2714     if (CurrentLoadStore)
2715       CurrentLoadStore->NextLoadStore = NextLoadStore;
2716   } else {
2717     LastLoadStoreInRegion = CurrentLoadStore;
2718   }
2719 }
2720 
calculateDependencies(ScheduleData * SD,bool InsertInReadyList,BoUpSLP * SLP)2721 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2722                                                      bool InsertInReadyList,
2723                                                      BoUpSLP *SLP) {
2724   assert(SD->isSchedulingEntity());
2725 
2726   SmallVector<ScheduleData *, 10> WorkList;
2727   WorkList.push_back(SD);
2728 
2729   while (!WorkList.empty()) {
2730     ScheduleData *SD = WorkList.back();
2731     WorkList.pop_back();
2732 
2733     ScheduleData *BundleMember = SD;
2734     while (BundleMember) {
2735       assert(isInSchedulingRegion(BundleMember));
2736       if (!BundleMember->hasValidDependencies()) {
2737 
2738         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
2739         BundleMember->Dependencies = 0;
2740         BundleMember->resetUnscheduledDeps();
2741 
2742         // Handle def-use chain dependencies.
2743         for (User *U : BundleMember->Inst->users()) {
2744           if (isa<Instruction>(U)) {
2745             ScheduleData *UseSD = getScheduleData(U);
2746             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2747               BundleMember->Dependencies++;
2748               ScheduleData *DestBundle = UseSD->FirstInBundle;
2749               if (!DestBundle->IsScheduled) {
2750                 BundleMember->incrementUnscheduledDeps(1);
2751               }
2752               if (!DestBundle->hasValidDependencies()) {
2753                 WorkList.push_back(DestBundle);
2754               }
2755             }
2756           } else {
2757             // I'm not sure if this can ever happen. But we need to be safe.
2758             // This lets the instruction/bundle never be scheduled and eventally
2759             // disable vectorization.
2760             BundleMember->Dependencies++;
2761             BundleMember->incrementUnscheduledDeps(1);
2762           }
2763         }
2764 
2765         // Handle the memory dependencies.
2766         ScheduleData *DepDest = BundleMember->NextLoadStore;
2767         if (DepDest) {
2768           Instruction *SrcInst = BundleMember->Inst;
2769           AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
2770           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2771 
2772           while (DepDest) {
2773             assert(isInSchedulingRegion(DepDest));
2774             if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
2775               if (SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)) {
2776                 DepDest->MemoryDependencies.push_back(BundleMember);
2777                 BundleMember->Dependencies++;
2778                 ScheduleData *DestBundle = DepDest->FirstInBundle;
2779                 if (!DestBundle->IsScheduled) {
2780                   BundleMember->incrementUnscheduledDeps(1);
2781                 }
2782                 if (!DestBundle->hasValidDependencies()) {
2783                   WorkList.push_back(DestBundle);
2784                 }
2785               }
2786             }
2787             DepDest = DepDest->NextLoadStore;
2788           }
2789         }
2790       }
2791       BundleMember = BundleMember->NextInBundle;
2792     }
2793     if (InsertInReadyList && SD->isReady()) {
2794       ReadyInsts.push_back(SD);
2795       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
2796     }
2797   }
2798 }
2799 
resetSchedule()2800 void BoUpSLP::BlockScheduling::resetSchedule() {
2801   assert(ScheduleStart &&
2802          "tried to reset schedule on block which has not been scheduled");
2803   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2804     ScheduleData *SD = getScheduleData(I);
2805     assert(isInSchedulingRegion(SD));
2806     SD->IsScheduled = false;
2807     SD->resetUnscheduledDeps();
2808   }
2809   ReadyInsts.clear();
2810 }
2811 
scheduleBlock(BlockScheduling * BS)2812 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2813 
2814   if (!BS->ScheduleStart)
2815     return;
2816 
2817   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2818 
2819   BS->resetSchedule();
2820 
2821   // For the real scheduling we use a more sophisticated ready-list: it is
2822   // sorted by the original instruction location. This lets the final schedule
2823   // be as  close as possible to the original instruction order.
2824   struct ScheduleDataCompare {
2825     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2826       return SD2->SchedulingPriority < SD1->SchedulingPriority;
2827     }
2828   };
2829   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2830 
2831   // Ensure that all depencency data is updated and fill the ready-list with
2832   // initial instructions.
2833   int Idx = 0;
2834   int NumToSchedule = 0;
2835   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
2836        I = I->getNextNode()) {
2837     ScheduleData *SD = BS->getScheduleData(I);
2838     assert(
2839         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
2840         "scheduler and vectorizer have different opinion on what is a bundle");
2841     SD->FirstInBundle->SchedulingPriority = Idx++;
2842     if (SD->isSchedulingEntity()) {
2843       BS->calculateDependencies(SD, false, this);
2844       NumToSchedule++;
2845     }
2846   }
2847   BS->initialFillReadyList(ReadyInsts);
2848 
2849   Instruction *LastScheduledInst = BS->ScheduleEnd;
2850 
2851   // Do the "real" scheduling.
2852   while (!ReadyInsts.empty()) {
2853     ScheduleData *picked = *ReadyInsts.begin();
2854     ReadyInsts.erase(ReadyInsts.begin());
2855 
2856     // Move the scheduled instruction(s) to their dedicated places, if not
2857     // there yet.
2858     ScheduleData *BundleMember = picked;
2859     while (BundleMember) {
2860       Instruction *pickedInst = BundleMember->Inst;
2861       if (LastScheduledInst->getNextNode() != pickedInst) {
2862         BS->BB->getInstList().remove(pickedInst);
2863         BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
2864       }
2865       LastScheduledInst = pickedInst;
2866       BundleMember = BundleMember->NextInBundle;
2867     }
2868 
2869     BS->schedule(picked, ReadyInsts);
2870     NumToSchedule--;
2871   }
2872   assert(NumToSchedule == 0 && "could not schedule all instructions");
2873 
2874   // Avoid duplicate scheduling of the block.
2875   BS->ScheduleStart = nullptr;
2876 }
2877 
2878 /// The SLPVectorizer Pass.
2879 struct SLPVectorizer : public FunctionPass {
2880   typedef SmallVector<StoreInst *, 8> StoreList;
2881   typedef MapVector<Value *, StoreList> StoreListMap;
2882 
2883   /// Pass identification, replacement for typeid
2884   static char ID;
2885 
SLPVectorizer__anon6868a6e70111::SLPVectorizer2886   explicit SLPVectorizer() : FunctionPass(ID) {
2887     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
2888   }
2889 
2890   ScalarEvolution *SE;
2891   const DataLayout *DL;
2892   TargetTransformInfo *TTI;
2893   TargetLibraryInfo *TLI;
2894   AliasAnalysis *AA;
2895   LoopInfo *LI;
2896   DominatorTree *DT;
2897   AssumptionCache *AC;
2898 
runOnFunction__anon6868a6e70111::SLPVectorizer2899   bool runOnFunction(Function &F) override {
2900     if (skipOptnoneFunction(F))
2901       return false;
2902 
2903     SE = &getAnalysis<ScalarEvolution>();
2904     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
2905     DL = DLP ? &DLP->getDataLayout() : nullptr;
2906     TTI = &getAnalysis<TargetTransformInfo>();
2907     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
2908     AA = &getAnalysis<AliasAnalysis>();
2909     LI = &getAnalysis<LoopInfo>();
2910     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2911     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2912 
2913     StoreRefs.clear();
2914     bool Changed = false;
2915 
2916     // If the target claims to have no vector registers don't attempt
2917     // vectorization.
2918     if (!TTI->getNumberOfRegisters(true))
2919       return false;
2920 
2921     // Must have DataLayout. We can't require it because some tests run w/o
2922     // triple.
2923     if (!DL)
2924       return false;
2925 
2926     // Don't vectorize when the attribute NoImplicitFloat is used.
2927     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
2928       return false;
2929 
2930     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
2931 
2932     // Use the bottom up slp vectorizer to construct chains that start with
2933     // store instructions.
2934     BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
2935 
2936     // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
2937     // delete instructions.
2938 
2939     // Scan the blocks in the function in post order.
2940     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
2941          e = po_end(&F.getEntryBlock()); it != e; ++it) {
2942       BasicBlock *BB = *it;
2943       // Vectorize trees that end at stores.
2944       if (unsigned count = collectStores(BB, R)) {
2945         (void)count;
2946         DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
2947         Changed |= vectorizeStoreChains(R);
2948       }
2949 
2950       // Vectorize trees that end at reductions.
2951       Changed |= vectorizeChainsInBlock(BB, R);
2952     }
2953 
2954     if (Changed) {
2955       R.optimizeGatherSequence();
2956       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
2957       DEBUG(verifyFunction(F));
2958     }
2959     return Changed;
2960   }
2961 
getAnalysisUsage__anon6868a6e70111::SLPVectorizer2962   void getAnalysisUsage(AnalysisUsage &AU) const override {
2963     FunctionPass::getAnalysisUsage(AU);
2964     AU.addRequired<AssumptionCacheTracker>();
2965     AU.addRequired<ScalarEvolution>();
2966     AU.addRequired<AliasAnalysis>();
2967     AU.addRequired<TargetTransformInfo>();
2968     AU.addRequired<LoopInfo>();
2969     AU.addRequired<DominatorTreeWrapperPass>();
2970     AU.addPreserved<LoopInfo>();
2971     AU.addPreserved<DominatorTreeWrapperPass>();
2972     AU.setPreservesCFG();
2973   }
2974 
2975 private:
2976 
2977   /// \brief Collect memory references and sort them according to their base
2978   /// object. We sort the stores to their base objects to reduce the cost of the
2979   /// quadratic search on the stores. TODO: We can further reduce this cost
2980   /// if we flush the chain creation every time we run into a memory barrier.
2981   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
2982 
2983   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
2984   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
2985 
2986   /// \brief Try to vectorize a list of operands.
2987   /// \@param BuildVector A list of users to ignore for the purpose of
2988   ///                     scheduling and that don't need extracting.
2989   /// \returns true if a value was vectorized.
2990   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
2991                           ArrayRef<Value *> BuildVector = None,
2992                           bool allowReorder = false);
2993 
2994   /// \brief Try to vectorize a chain that may start at the operands of \V;
2995   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
2996 
2997   /// \brief Vectorize the stores that were collected in StoreRefs.
2998   bool vectorizeStoreChains(BoUpSLP &R);
2999 
3000   /// \brief Scan the basic block and look for patterns that are likely to start
3001   /// a vectorization chain.
3002   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3003 
3004   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3005                            BoUpSLP &R);
3006 
3007   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3008                        BoUpSLP &R);
3009 private:
3010   StoreListMap StoreRefs;
3011 };
3012 
3013 /// \brief Check that the Values in the slice in VL array are still existent in
3014 /// the WeakVH array.
3015 /// Vectorization of part of the VL array may cause later values in the VL array
3016 /// to become invalid. We track when this has happened in the WeakVH array.
hasValueBeenRAUWed(ArrayRef<Value * > & VL,SmallVectorImpl<WeakVH> & VH,unsigned SliceBegin,unsigned SliceSize)3017 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
3018                                SmallVectorImpl<WeakVH> &VH,
3019                                unsigned SliceBegin,
3020                                unsigned SliceSize) {
3021   for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
3022     if (VH[i] != VL[i])
3023       return true;
3024 
3025   return false;
3026 }
3027 
vectorizeStoreChain(ArrayRef<Value * > Chain,int CostThreshold,BoUpSLP & R)3028 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3029                                           int CostThreshold, BoUpSLP &R) {
3030   unsigned ChainLen = Chain.size();
3031   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3032         << "\n");
3033   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
3034   unsigned Sz = DL->getTypeSizeInBits(StoreTy);
3035   unsigned VF = MinVecRegSize / Sz;
3036 
3037   if (!isPowerOf2_32(Sz) || VF < 2)
3038     return false;
3039 
3040   // Keep track of values that were deleted by vectorizing in the loop below.
3041   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3042 
3043   bool Changed = false;
3044   // Look for profitable vectorizable trees at all offsets, starting at zero.
3045   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3046     if (i + VF > e)
3047       break;
3048 
3049     // Check that a previous iteration of this loop did not delete the Value.
3050     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3051       continue;
3052 
3053     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3054           << "\n");
3055     ArrayRef<Value *> Operands = Chain.slice(i, VF);
3056 
3057     R.buildTree(Operands);
3058 
3059     int Cost = R.getTreeCost();
3060 
3061     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3062     if (Cost < CostThreshold) {
3063       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3064       R.vectorizeTree();
3065 
3066       // Move to the next bundle.
3067       i += VF - 1;
3068       Changed = true;
3069     }
3070   }
3071 
3072   return Changed;
3073 }
3074 
vectorizeStores(ArrayRef<StoreInst * > Stores,int costThreshold,BoUpSLP & R)3075 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3076                                     int costThreshold, BoUpSLP &R) {
3077   SetVector<Value *> Heads, Tails;
3078   SmallDenseMap<Value *, Value *> ConsecutiveChain;
3079 
3080   // We may run into multiple chains that merge into a single chain. We mark the
3081   // stores that we vectorized so that we don't visit the same store twice.
3082   BoUpSLP::ValueSet VectorizedStores;
3083   bool Changed = false;
3084 
3085   // Do a quadratic search on all of the given stores and find
3086   // all of the pairs of stores that follow each other.
3087   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3088     for (unsigned j = 0; j < e; ++j) {
3089       if (i == j)
3090         continue;
3091 
3092       if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
3093         Tails.insert(Stores[j]);
3094         Heads.insert(Stores[i]);
3095         ConsecutiveChain[Stores[i]] = Stores[j];
3096       }
3097     }
3098   }
3099 
3100   // For stores that start but don't end a link in the chain:
3101   for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
3102        it != e; ++it) {
3103     if (Tails.count(*it))
3104       continue;
3105 
3106     // We found a store instr that starts a chain. Now follow the chain and try
3107     // to vectorize it.
3108     BoUpSLP::ValueList Operands;
3109     Value *I = *it;
3110     // Collect the chain into a list.
3111     while (Tails.count(I) || Heads.count(I)) {
3112       if (VectorizedStores.count(I))
3113         break;
3114       Operands.push_back(I);
3115       // Move to the next value in the chain.
3116       I = ConsecutiveChain[I];
3117     }
3118 
3119     bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
3120 
3121     // Mark the vectorized stores so that we don't vectorize them again.
3122     if (Vectorized)
3123       VectorizedStores.insert(Operands.begin(), Operands.end());
3124     Changed |= Vectorized;
3125   }
3126 
3127   return Changed;
3128 }
3129 
3130 
collectStores(BasicBlock * BB,BoUpSLP & R)3131 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3132   unsigned count = 0;
3133   StoreRefs.clear();
3134   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3135     StoreInst *SI = dyn_cast<StoreInst>(it);
3136     if (!SI)
3137       continue;
3138 
3139     // Don't touch volatile stores.
3140     if (!SI->isSimple())
3141       continue;
3142 
3143     // Check that the pointer points to scalars.
3144     Type *Ty = SI->getValueOperand()->getType();
3145     if (!isValidElementType(Ty))
3146       continue;
3147 
3148     // Find the base pointer.
3149     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3150 
3151     // Save the store locations.
3152     StoreRefs[Ptr].push_back(SI);
3153     count++;
3154   }
3155   return count;
3156 }
3157 
tryToVectorizePair(Value * A,Value * B,BoUpSLP & R)3158 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3159   if (!A || !B)
3160     return false;
3161   Value *VL[] = { A, B };
3162   return tryToVectorizeList(VL, R, None, true);
3163 }
3164 
tryToVectorizeList(ArrayRef<Value * > VL,BoUpSLP & R,ArrayRef<Value * > BuildVector,bool allowReorder)3165 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3166                                        ArrayRef<Value *> BuildVector,
3167                                        bool allowReorder) {
3168   if (VL.size() < 2)
3169     return false;
3170 
3171   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3172 
3173   // Check that all of the parts are scalar instructions of the same type.
3174   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3175   if (!I0)
3176     return false;
3177 
3178   unsigned Opcode0 = I0->getOpcode();
3179 
3180   Type *Ty0 = I0->getType();
3181   unsigned Sz = DL->getTypeSizeInBits(Ty0);
3182   unsigned VF = MinVecRegSize / Sz;
3183 
3184   for (int i = 0, e = VL.size(); i < e; ++i) {
3185     Type *Ty = VL[i]->getType();
3186     if (!isValidElementType(Ty))
3187       return false;
3188     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
3189     if (!Inst || Inst->getOpcode() != Opcode0)
3190       return false;
3191   }
3192 
3193   bool Changed = false;
3194 
3195   // Keep track of values that were deleted by vectorizing in the loop below.
3196   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3197 
3198   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3199     unsigned OpsWidth = 0;
3200 
3201     if (i + VF > e)
3202       OpsWidth = e - i;
3203     else
3204       OpsWidth = VF;
3205 
3206     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3207       break;
3208 
3209     // Check that a previous iteration of this loop did not delete the Value.
3210     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3211       continue;
3212 
3213     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3214                  << "\n");
3215     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3216 
3217     ArrayRef<Value *> BuildVectorSlice;
3218     if (!BuildVector.empty())
3219       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3220 
3221     R.buildTree(Ops, BuildVectorSlice);
3222     // TODO: check if we can allow reordering also for other cases than
3223     // tryToVectorizePair()
3224     if (allowReorder && R.shouldReorder()) {
3225       assert(Ops.size() == 2);
3226       assert(BuildVectorSlice.empty());
3227       Value *ReorderedOps[] = { Ops[1], Ops[0] };
3228       R.buildTree(ReorderedOps, None);
3229     }
3230     int Cost = R.getTreeCost();
3231 
3232     if (Cost < -SLPCostThreshold) {
3233       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3234       Value *VectorizedRoot = R.vectorizeTree();
3235 
3236       // Reconstruct the build vector by extracting the vectorized root. This
3237       // way we handle the case where some elements of the vector are undefined.
3238       //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3239       if (!BuildVectorSlice.empty()) {
3240         // The insert point is the last build vector instruction. The vectorized
3241         // root will precede it. This guarantees that we get an instruction. The
3242         // vectorized tree could have been constant folded.
3243         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3244         unsigned VecIdx = 0;
3245         for (auto &V : BuildVectorSlice) {
3246           IRBuilder<true, NoFolder> Builder(
3247               ++BasicBlock::iterator(InsertAfter));
3248           InsertElementInst *IE = cast<InsertElementInst>(V);
3249           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3250               VectorizedRoot, Builder.getInt32(VecIdx++)));
3251           IE->setOperand(1, Extract);
3252           IE->removeFromParent();
3253           IE->insertAfter(Extract);
3254           InsertAfter = IE;
3255         }
3256       }
3257       // Move to the next bundle.
3258       i += VF - 1;
3259       Changed = true;
3260     }
3261   }
3262 
3263   return Changed;
3264 }
3265 
tryToVectorize(BinaryOperator * V,BoUpSLP & R)3266 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3267   if (!V)
3268     return false;
3269 
3270   // Try to vectorize V.
3271   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3272     return true;
3273 
3274   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3275   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3276   // Try to skip B.
3277   if (B && B->hasOneUse()) {
3278     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3279     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3280     if (tryToVectorizePair(A, B0, R)) {
3281       return true;
3282     }
3283     if (tryToVectorizePair(A, B1, R)) {
3284       return true;
3285     }
3286   }
3287 
3288   // Try to skip A.
3289   if (A && A->hasOneUse()) {
3290     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3291     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3292     if (tryToVectorizePair(A0, B, R)) {
3293       return true;
3294     }
3295     if (tryToVectorizePair(A1, B, R)) {
3296       return true;
3297     }
3298   }
3299   return 0;
3300 }
3301 
3302 /// \brief Generate a shuffle mask to be used in a reduction tree.
3303 ///
3304 /// \param VecLen The length of the vector to be reduced.
3305 /// \param NumEltsToRdx The number of elements that should be reduced in the
3306 ///        vector.
3307 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3308 ///        reduction. A pairwise reduction will generate a mask of
3309 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
3310 ///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3311 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
createRdxShuffleMask(unsigned VecLen,unsigned NumEltsToRdx,bool IsPairwise,bool IsLeft,IRBuilder<> & Builder)3312 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3313                                    bool IsPairwise, bool IsLeft,
3314                                    IRBuilder<> &Builder) {
3315   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3316 
3317   SmallVector<Constant *, 32> ShuffleMask(
3318       VecLen, UndefValue::get(Builder.getInt32Ty()));
3319 
3320   if (IsPairwise)
3321     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3322     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3323       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3324   else
3325     // Move the upper half of the vector to the lower half.
3326     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3327       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3328 
3329   return ConstantVector::get(ShuffleMask);
3330 }
3331 
3332 
3333 /// Model horizontal reductions.
3334 ///
3335 /// A horizontal reduction is a tree of reduction operations (currently add and
3336 /// fadd) that has operations that can be put into a vector as its leaf.
3337 /// For example, this tree:
3338 ///
3339 /// mul mul mul mul
3340 ///  \  /    \  /
3341 ///   +       +
3342 ///    \     /
3343 ///       +
3344 /// This tree has "mul" as its reduced values and "+" as its reduction
3345 /// operations. A reduction might be feeding into a store or a binary operation
3346 /// feeding a phi.
3347 ///    ...
3348 ///    \  /
3349 ///     +
3350 ///     |
3351 ///  phi +=
3352 ///
3353 ///  Or:
3354 ///    ...
3355 ///    \  /
3356 ///     +
3357 ///     |
3358 ///   *p =
3359 ///
3360 class HorizontalReduction {
3361   SmallVector<Value *, 16> ReductionOps;
3362   SmallVector<Value *, 32> ReducedVals;
3363 
3364   BinaryOperator *ReductionRoot;
3365   PHINode *ReductionPHI;
3366 
3367   /// The opcode of the reduction.
3368   unsigned ReductionOpcode;
3369   /// The opcode of the values we perform a reduction on.
3370   unsigned ReducedValueOpcode;
3371   /// The width of one full horizontal reduction operation.
3372   unsigned ReduxWidth;
3373   /// Should we model this reduction as a pairwise reduction tree or a tree that
3374   /// splits the vector in halves and adds those halves.
3375   bool IsPairwiseReduction;
3376 
3377 public:
HorizontalReduction()3378   HorizontalReduction()
3379     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3380     ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3381 
3382   /// \brief Try to find a reduction tree.
matchAssociativeReduction(PHINode * Phi,BinaryOperator * B,const DataLayout * DL)3383   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
3384                                  const DataLayout *DL) {
3385     assert((!Phi ||
3386             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3387            "Thi phi needs to use the binary operator");
3388 
3389     // We could have a initial reductions that is not an add.
3390     //  r *= v1 + v2 + v3 + v4
3391     // In such a case start looking for a tree rooted in the first '+'.
3392     if (Phi) {
3393       if (B->getOperand(0) == Phi) {
3394         Phi = nullptr;
3395         B = dyn_cast<BinaryOperator>(B->getOperand(1));
3396       } else if (B->getOperand(1) == Phi) {
3397         Phi = nullptr;
3398         B = dyn_cast<BinaryOperator>(B->getOperand(0));
3399       }
3400     }
3401 
3402     if (!B)
3403       return false;
3404 
3405     Type *Ty = B->getType();
3406     if (!isValidElementType(Ty))
3407       return false;
3408 
3409     ReductionOpcode = B->getOpcode();
3410     ReducedValueOpcode = 0;
3411     ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
3412     ReductionRoot = B;
3413     ReductionPHI = Phi;
3414 
3415     if (ReduxWidth < 4)
3416       return false;
3417 
3418     // We currently only support adds.
3419     if (ReductionOpcode != Instruction::Add &&
3420         ReductionOpcode != Instruction::FAdd)
3421       return false;
3422 
3423     // Post order traverse the reduction tree starting at B. We only handle true
3424     // trees containing only binary operators.
3425     SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3426     Stack.push_back(std::make_pair(B, 0));
3427     while (!Stack.empty()) {
3428       BinaryOperator *TreeN = Stack.back().first;
3429       unsigned EdgeToVist = Stack.back().second++;
3430       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3431 
3432       // Only handle trees in the current basic block.
3433       if (TreeN->getParent() != B->getParent())
3434         return false;
3435 
3436       // Each tree node needs to have one user except for the ultimate
3437       // reduction.
3438       if (!TreeN->hasOneUse() && TreeN != B)
3439         return false;
3440 
3441       // Postorder vist.
3442       if (EdgeToVist == 2 || IsReducedValue) {
3443         if (IsReducedValue) {
3444           // Make sure that the opcodes of the operations that we are going to
3445           // reduce match.
3446           if (!ReducedValueOpcode)
3447             ReducedValueOpcode = TreeN->getOpcode();
3448           else if (ReducedValueOpcode != TreeN->getOpcode())
3449             return false;
3450           ReducedVals.push_back(TreeN);
3451         } else {
3452           // We need to be able to reassociate the adds.
3453           if (!TreeN->isAssociative())
3454             return false;
3455           ReductionOps.push_back(TreeN);
3456         }
3457         // Retract.
3458         Stack.pop_back();
3459         continue;
3460       }
3461 
3462       // Visit left or right.
3463       Value *NextV = TreeN->getOperand(EdgeToVist);
3464       BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3465       if (Next)
3466         Stack.push_back(std::make_pair(Next, 0));
3467       else if (NextV != Phi)
3468         return false;
3469     }
3470     return true;
3471   }
3472 
3473   /// \brief Attempt to vectorize the tree found by
3474   /// matchAssociativeReduction.
tryToReduce(BoUpSLP & V,TargetTransformInfo * TTI)3475   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3476     if (ReducedVals.empty())
3477       return false;
3478 
3479     unsigned NumReducedVals = ReducedVals.size();
3480     if (NumReducedVals < ReduxWidth)
3481       return false;
3482 
3483     Value *VectorizedTree = nullptr;
3484     IRBuilder<> Builder(ReductionRoot);
3485     FastMathFlags Unsafe;
3486     Unsafe.setUnsafeAlgebra();
3487     Builder.SetFastMathFlags(Unsafe);
3488     unsigned i = 0;
3489 
3490     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3491       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3492 
3493       // Estimate cost.
3494       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3495       if (Cost >= -SLPCostThreshold)
3496         break;
3497 
3498       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3499                    << ". (HorRdx)\n");
3500 
3501       // Vectorize a tree.
3502       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3503       Value *VectorizedRoot = V.vectorizeTree();
3504 
3505       // Emit a reduction.
3506       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3507       if (VectorizedTree) {
3508         Builder.SetCurrentDebugLocation(Loc);
3509         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3510                                      ReducedSubTree, "bin.rdx");
3511       } else
3512         VectorizedTree = ReducedSubTree;
3513     }
3514 
3515     if (VectorizedTree) {
3516       // Finish the reduction.
3517       for (; i < NumReducedVals; ++i) {
3518         Builder.SetCurrentDebugLocation(
3519           cast<Instruction>(ReducedVals[i])->getDebugLoc());
3520         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3521                                      ReducedVals[i]);
3522       }
3523       // Update users.
3524       if (ReductionPHI) {
3525         assert(ReductionRoot && "Need a reduction operation");
3526         ReductionRoot->setOperand(0, VectorizedTree);
3527         ReductionRoot->setOperand(1, ReductionPHI);
3528       } else
3529         ReductionRoot->replaceAllUsesWith(VectorizedTree);
3530     }
3531     return VectorizedTree != nullptr;
3532   }
3533 
3534 private:
3535 
3536   /// \brief Calcuate the cost of a reduction.
getReductionCost(TargetTransformInfo * TTI,Value * FirstReducedVal)3537   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3538     Type *ScalarTy = FirstReducedVal->getType();
3539     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3540 
3541     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3542     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3543 
3544     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3545     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3546 
3547     int ScalarReduxCost =
3548         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3549 
3550     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3551                  << " for reduction that starts with " << *FirstReducedVal
3552                  << " (It is a "
3553                  << (IsPairwiseReduction ? "pairwise" : "splitting")
3554                  << " reduction)\n");
3555 
3556     return VecReduxCost - ScalarReduxCost;
3557   }
3558 
createBinOp(IRBuilder<> & Builder,unsigned Opcode,Value * L,Value * R,const Twine & Name="")3559   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3560                             Value *R, const Twine &Name = "") {
3561     if (Opcode == Instruction::FAdd)
3562       return Builder.CreateFAdd(L, R, Name);
3563     return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3564   }
3565 
3566   /// \brief Emit a horizontal reduction of the vectorized value.
emitReduction(Value * VectorizedValue,IRBuilder<> & Builder)3567   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3568     assert(VectorizedValue && "Need to have a vectorized tree node");
3569     assert(isPowerOf2_32(ReduxWidth) &&
3570            "We only handle power-of-two reductions for now");
3571 
3572     Value *TmpVec = VectorizedValue;
3573     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3574       if (IsPairwiseReduction) {
3575         Value *LeftMask =
3576           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3577         Value *RightMask =
3578           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3579 
3580         Value *LeftShuf = Builder.CreateShuffleVector(
3581           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3582         Value *RightShuf = Builder.CreateShuffleVector(
3583           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3584           "rdx.shuf.r");
3585         TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3586                              "bin.rdx");
3587       } else {
3588         Value *UpperHalf =
3589           createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3590         Value *Shuf = Builder.CreateShuffleVector(
3591           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3592         TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3593       }
3594     }
3595 
3596     // The result is in the first element of the vector.
3597     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3598   }
3599 };
3600 
3601 /// \brief Recognize construction of vectors like
3602 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
3603 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
3604 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
3605 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
3606 ///
3607 /// Returns true if it matches
3608 ///
findBuildVector(InsertElementInst * FirstInsertElem,SmallVectorImpl<Value * > & BuildVector,SmallVectorImpl<Value * > & BuildVectorOpds)3609 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3610                             SmallVectorImpl<Value *> &BuildVector,
3611                             SmallVectorImpl<Value *> &BuildVectorOpds) {
3612   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3613     return false;
3614 
3615   InsertElementInst *IE = FirstInsertElem;
3616   while (true) {
3617     BuildVector.push_back(IE);
3618     BuildVectorOpds.push_back(IE->getOperand(1));
3619 
3620     if (IE->use_empty())
3621       return false;
3622 
3623     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3624     if (!NextUse)
3625       return true;
3626 
3627     // If this isn't the final use, make sure the next insertelement is the only
3628     // use. It's OK if the final constructed vector is used multiple times
3629     if (!IE->hasOneUse())
3630       return false;
3631 
3632     IE = NextUse;
3633   }
3634 
3635   return false;
3636 }
3637 
PhiTypeSorterFunc(Value * V,Value * V2)3638 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3639   return V->getType() < V2->getType();
3640 }
3641 
vectorizeChainsInBlock(BasicBlock * BB,BoUpSLP & R)3642 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3643   bool Changed = false;
3644   SmallVector<Value *, 4> Incoming;
3645   SmallSet<Value *, 16> VisitedInstrs;
3646 
3647   bool HaveVectorizedPhiNodes = true;
3648   while (HaveVectorizedPhiNodes) {
3649     HaveVectorizedPhiNodes = false;
3650 
3651     // Collect the incoming values from the PHIs.
3652     Incoming.clear();
3653     for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3654          ++instr) {
3655       PHINode *P = dyn_cast<PHINode>(instr);
3656       if (!P)
3657         break;
3658 
3659       if (!VisitedInstrs.count(P))
3660         Incoming.push_back(P);
3661     }
3662 
3663     // Sort by type.
3664     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3665 
3666     // Try to vectorize elements base on their type.
3667     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3668                                            E = Incoming.end();
3669          IncIt != E;) {
3670 
3671       // Look for the next elements with the same type.
3672       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3673       while (SameTypeIt != E &&
3674              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3675         VisitedInstrs.insert(*SameTypeIt);
3676         ++SameTypeIt;
3677       }
3678 
3679       // Try to vectorize them.
3680       unsigned NumElts = (SameTypeIt - IncIt);
3681       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3682       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3683         // Success start over because instructions might have been changed.
3684         HaveVectorizedPhiNodes = true;
3685         Changed = true;
3686         break;
3687       }
3688 
3689       // Start over at the next instruction of a different type (or the end).
3690       IncIt = SameTypeIt;
3691     }
3692   }
3693 
3694   VisitedInstrs.clear();
3695 
3696   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3697     // We may go through BB multiple times so skip the one we have checked.
3698     if (!VisitedInstrs.insert(it).second)
3699       continue;
3700 
3701     if (isa<DbgInfoIntrinsic>(it))
3702       continue;
3703 
3704     // Try to vectorize reductions that use PHINodes.
3705     if (PHINode *P = dyn_cast<PHINode>(it)) {
3706       // Check that the PHI is a reduction PHI.
3707       if (P->getNumIncomingValues() != 2)
3708         return Changed;
3709       Value *Rdx =
3710           (P->getIncomingBlock(0) == BB
3711                ? (P->getIncomingValue(0))
3712                : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3713                                                : nullptr));
3714       // Check if this is a Binary Operator.
3715       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3716       if (!BI)
3717         continue;
3718 
3719       // Try to match and vectorize a horizontal reduction.
3720       HorizontalReduction HorRdx;
3721       if (ShouldVectorizeHor &&
3722           HorRdx.matchAssociativeReduction(P, BI, DL) &&
3723           HorRdx.tryToReduce(R, TTI)) {
3724         Changed = true;
3725         it = BB->begin();
3726         e = BB->end();
3727         continue;
3728       }
3729 
3730      Value *Inst = BI->getOperand(0);
3731       if (Inst == P)
3732         Inst = BI->getOperand(1);
3733 
3734       if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3735         // We would like to start over since some instructions are deleted
3736         // and the iterator may become invalid value.
3737         Changed = true;
3738         it = BB->begin();
3739         e = BB->end();
3740         continue;
3741       }
3742 
3743       continue;
3744     }
3745 
3746     // Try to vectorize horizontal reductions feeding into a store.
3747     if (ShouldStartVectorizeHorAtStore)
3748       if (StoreInst *SI = dyn_cast<StoreInst>(it))
3749         if (BinaryOperator *BinOp =
3750                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3751           HorizontalReduction HorRdx;
3752           if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
3753                 HorRdx.tryToReduce(R, TTI)) ||
3754                tryToVectorize(BinOp, R))) {
3755             Changed = true;
3756             it = BB->begin();
3757             e = BB->end();
3758             continue;
3759           }
3760         }
3761 
3762     // Try to vectorize horizontal reductions feeding into a return.
3763     if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
3764       if (RI->getNumOperands() != 0)
3765         if (BinaryOperator *BinOp =
3766                 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
3767           DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
3768           if (tryToVectorizePair(BinOp->getOperand(0),
3769                                  BinOp->getOperand(1), R)) {
3770             Changed = true;
3771             it = BB->begin();
3772             e = BB->end();
3773             continue;
3774           }
3775         }
3776 
3777     // Try to vectorize trees that start at compare instructions.
3778     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3779       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3780         Changed = true;
3781         // We would like to start over since some instructions are deleted
3782         // and the iterator may become invalid value.
3783         it = BB->begin();
3784         e = BB->end();
3785         continue;
3786       }
3787 
3788       for (int i = 0; i < 2; ++i) {
3789         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3790           if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3791             Changed = true;
3792             // We would like to start over since some instructions are deleted
3793             // and the iterator may become invalid value.
3794             it = BB->begin();
3795             e = BB->end();
3796           }
3797         }
3798       }
3799       continue;
3800     }
3801 
3802     // Try to vectorize trees that start at insertelement instructions.
3803     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3804       SmallVector<Value *, 16> BuildVector;
3805       SmallVector<Value *, 16> BuildVectorOpds;
3806       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3807         continue;
3808 
3809       // Vectorize starting with the build vector operands ignoring the
3810       // BuildVector instructions for the purpose of scheduling and user
3811       // extraction.
3812       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
3813         Changed = true;
3814         it = BB->begin();
3815         e = BB->end();
3816       }
3817 
3818       continue;
3819     }
3820   }
3821 
3822   return Changed;
3823 }
3824 
vectorizeStoreChains(BoUpSLP & R)3825 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
3826   bool Changed = false;
3827   // Attempt to sort and vectorize each of the store-groups.
3828   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
3829        it != e; ++it) {
3830     if (it->second.size() < 2)
3831       continue;
3832 
3833     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
3834           << it->second.size() << ".\n");
3835 
3836     // Process the stores in chunks of 16.
3837     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
3838       unsigned Len = std::min<unsigned>(CE - CI, 16);
3839       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
3840                                  -SLPCostThreshold, R);
3841     }
3842   }
3843   return Changed;
3844 }
3845 
3846 } // end anonymous namespace
3847 
3848 char SLPVectorizer::ID = 0;
3849 static const char lv_name[] = "SLP Vectorizer";
3850 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
3851 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
3852 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
3853 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
3854 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
3855 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
3856 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
3857 
3858 namespace llvm {
createSLPVectorizerPass()3859 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
3860 }
3861