xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp (revision 09e516c54b0c5ec3fe6b3ef6c70d9e09e89abc95)
1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// This file implements SLP analysis based on VPlan. The analysis is based on
10 /// the ideas described in
11 ///
12 ///   Look-ahead SLP: auto-vectorization in the presence of commutative
13 ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
14 ///   Luís F. W. Góes
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #include "VPlan.h"
19 #include "llvm/ADT/DepthFirstIterator.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/VectorUtils.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/Dominators.h"
28 #include "llvm/IR/InstrTypes.h"
29 #include "llvm/IR/Instruction.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/IR/Value.h"
33 #include "llvm/Support/Casting.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/GraphWriter.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
39 #include <cassert>
40 #include <iterator>
41 #include <string>
42 #include <vector>
43 
44 using namespace llvm;
45 
46 #define DEBUG_TYPE "vplan-slp"
47 
48 // Number of levels to look ahead when re-ordering multi node operands.
49 static unsigned LookaheadMaxDepth = 5;
50 
51 VPInstruction *VPlanSlp::markFailed() {
52   // FIXME: Currently this is used to signal we hit instructions we cannot
53   //        trivially SLP'ize.
54   CompletelySLP = false;
55   return nullptr;
56 }
57 
58 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
59   if (all_of(Operands, [](VPValue *V) {
60         return cast<VPInstruction>(V)->getUnderlyingInstr();
61       })) {
62     unsigned BundleSize = 0;
63     for (VPValue *V : Operands) {
64       Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
65       assert(!T->isVectorTy() && "Only scalar types supported for now");
66       BundleSize += T->getScalarSizeInBits();
67     }
68     WidestBundleBits = std::max(WidestBundleBits, BundleSize);
69   }
70 
71   auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
72   assert(Res.second &&
73          "Already created a combined instruction for the operand bundle");
74   (void)Res;
75 }
76 
77 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
78   // Currently we only support VPInstructions.
79   if (!all_of(Operands, [](VPValue *Op) {
80         return Op && isa<VPInstruction>(Op) &&
81                cast<VPInstruction>(Op)->getUnderlyingInstr();
82       })) {
83     LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
84     return false;
85   }
86 
87   // Check if opcodes and type width agree for all instructions in the bundle.
88   // FIXME: Differing widths/opcodes can be handled by inserting additional
89   //        instructions.
90   // FIXME: Deal with non-primitive types.
91   const Instruction *OriginalInstr =
92       cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
93   unsigned Opcode = OriginalInstr->getOpcode();
94   unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
95   if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
96         const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
97         return I->getOpcode() == Opcode &&
98                I->getType()->getPrimitiveSizeInBits() == Width;
99       })) {
100     LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
101     return false;
102   }
103 
104   // For now, all operands must be defined in the same BB.
105   if (any_of(Operands, [this](VPValue *Op) {
106         return cast<VPInstruction>(Op)->getParent() != &this->BB;
107       })) {
108     LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
109     return false;
110   }
111 
112   if (any_of(Operands,
113              [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
114     LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
115     return false;
116   }
117 
118   // For loads, check that there are no instructions writing to memory in
119   // between them.
120   // TODO: we only have to forbid instructions writing to memory that could
121   //       interfere with any of the loads in the bundle
122   if (Opcode == Instruction::Load) {
123     unsigned LoadsSeen = 0;
124     VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
125     for (auto &I : *Parent) {
126       auto *VPI = cast<VPInstruction>(&I);
127       if (VPI->getOpcode() == Instruction::Load &&
128           std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
129         LoadsSeen++;
130 
131       if (LoadsSeen == Operands.size())
132         break;
133       if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
134         LLVM_DEBUG(
135             dbgs() << "VPSLP: instruction modifying memory between loads\n");
136         return false;
137       }
138     }
139 
140     if (!all_of(Operands, [](VPValue *Op) {
141           return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
142               ->isSimple();
143         })) {
144       LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
145       return false;
146     }
147   }
148 
149   if (Opcode == Instruction::Store)
150     if (!all_of(Operands, [](VPValue *Op) {
151           return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
152               ->isSimple();
153         })) {
154       LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
155       return false;
156     }
157 
158   return true;
159 }
160 
161 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
162                                              unsigned OperandIndex) {
163   SmallVector<VPValue *, 4> Operands;
164   for (VPValue *V : Values) {
165     auto *U = cast<VPUser>(V);
166     Operands.push_back(U->getOperand(OperandIndex));
167   }
168   return Operands;
169 }
170 
171 static bool areCommutative(ArrayRef<VPValue *> Values) {
172   return Instruction::isCommutative(
173       cast<VPInstruction>(Values[0])->getOpcode());
174 }
175 
176 static SmallVector<SmallVector<VPValue *, 4>, 4>
177 getOperands(ArrayRef<VPValue *> Values) {
178   SmallVector<SmallVector<VPValue *, 4>, 4> Result;
179   auto *VPI = cast<VPInstruction>(Values[0]);
180 
181   switch (VPI->getOpcode()) {
182   case Instruction::Load:
183     llvm_unreachable("Loads terminate a tree, no need to get operands");
184   case Instruction::Store:
185     Result.push_back(getOperands(Values, 0));
186     break;
187   default:
188     for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
189       Result.push_back(getOperands(Values, I));
190     break;
191   }
192 
193   return Result;
194 }
195 
196 /// Returns the opcode of Values or ~0 if they do not all agree.
197 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
198   unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
199   if (any_of(Values, [Opcode](VPValue *V) {
200         return cast<VPInstruction>(V)->getOpcode() != Opcode;
201       }))
202     return None;
203   return {Opcode};
204 }
205 
206 /// Returns true if A and B access sequential memory if they are loads or
207 /// stores or if they have identical opcodes otherwise.
208 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
209                                   VPInterleavedAccessInfo &IAI) {
210   if (A->getOpcode() != B->getOpcode())
211     return false;
212 
213   if (A->getOpcode() != Instruction::Load &&
214       A->getOpcode() != Instruction::Store)
215     return true;
216   auto *GA = IAI.getInterleaveGroup(A);
217   auto *GB = IAI.getInterleaveGroup(B);
218 
219   return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
220 }
221 
222 /// Implements getLAScore from Listing 7 in the paper.
223 /// Traverses and compares operands of V1 and V2 to MaxLevel.
224 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
225                            VPInterleavedAccessInfo &IAI) {
226   if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
227     return 0;
228 
229   if (MaxLevel == 0)
230     return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
231                                            cast<VPInstruction>(V2), IAI);
232 
233   unsigned Score = 0;
234   for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
235     for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
236       Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
237                           cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
238   return Score;
239 }
240 
241 std::pair<VPlanSlp::OpMode, VPValue *>
242 VPlanSlp::getBest(OpMode Mode, VPValue *Last,
243                   SmallVectorImpl<VPValue *> &Candidates,
244                   VPInterleavedAccessInfo &IAI) {
245   LLVM_DEBUG(dbgs() << "      getBest\n");
246   VPValue *Best = Candidates[0];
247   SmallVector<VPValue *, 4> BestCandidates;
248 
249   LLVM_DEBUG(dbgs() << "        Candidates  for "
250                     << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
251   for (auto *Candidate : Candidates) {
252     auto *LastI = cast<VPInstruction>(Last);
253     auto *CandidateI = cast<VPInstruction>(Candidate);
254     if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
255       LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
256                         << " ");
257       BestCandidates.push_back(Candidate);
258     }
259   }
260   LLVM_DEBUG(dbgs() << "\n");
261 
262   if (BestCandidates.empty())
263     return {OpMode::Failed, nullptr};
264 
265   if (BestCandidates.size() == 1)
266     return {Mode, BestCandidates[0]};
267 
268   if (Mode == OpMode::Opcode) {
269     unsigned BestScore = 0;
270     for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
271       unsigned PrevScore = ~0u;
272       bool AllSame = true;
273 
274       // FIXME: Avoid visiting the same operands multiple times.
275       for (auto *Candidate : BestCandidates) {
276         unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
277         if (PrevScore == ~0u)
278           PrevScore = Score;
279         if (PrevScore != Score)
280           AllSame = false;
281         PrevScore = Score;
282 
283         if (Score > BestScore) {
284           BestScore = Score;
285           Best = Candidate;
286         }
287       }
288       if (!AllSame)
289         break;
290     }
291   }
292   LLVM_DEBUG(dbgs() << "Found best "
293                     << *cast<VPInstruction>(Best)->getUnderlyingInstr()
294                     << "\n");
295   std::remove(Candidates.begin(), Candidates.end(), Best);
296 
297   return {Mode, Best};
298 }
299 
300 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
301   SmallVector<MultiNodeOpTy, 4> FinalOrder;
302   SmallVector<OpMode, 4> Mode;
303   FinalOrder.reserve(MultiNodeOps.size());
304   Mode.reserve(MultiNodeOps.size());
305 
306   LLVM_DEBUG(dbgs() << "Reordering multinode\n");
307 
308   for (auto &Operands : MultiNodeOps) {
309     FinalOrder.push_back({Operands.first, {Operands.second[0]}});
310     if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
311         Instruction::Load)
312       Mode.push_back(OpMode::Load);
313     else
314       Mode.push_back(OpMode::Opcode);
315   }
316 
317   for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
318     LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
319     SmallVector<VPValue *, 4> Candidates;
320     Candidates.reserve(MultiNodeOps.size());
321     LLVM_DEBUG(dbgs() << "  Candidates  ");
322     for (auto Ops : MultiNodeOps) {
323       LLVM_DEBUG(
324           dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
325                  << " ");
326       Candidates.push_back(Ops.second[Lane]);
327     }
328     LLVM_DEBUG(dbgs() << "\n");
329 
330     for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
331       LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
332       if (Mode[Op] == OpMode::Failed)
333         continue;
334 
335       VPValue *Last = FinalOrder[Op].second[Lane - 1];
336       std::pair<OpMode, VPValue *> Res =
337           getBest(Mode[Op], Last, Candidates, IAI);
338       if (Res.second)
339         FinalOrder[Op].second.push_back(Res.second);
340       else
341         // TODO: handle this case
342         FinalOrder[Op].second.push_back(markFailed());
343     }
344   }
345 
346   return FinalOrder;
347 }
348 
349 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
350   LLVM_DEBUG(dbgs() << " Ops: ");
351   for (auto Op : Values)
352     if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
353       LLVM_DEBUG(dbgs() << *Instr << " | ");
354     else
355       LLVM_DEBUG(dbgs() << " nullptr | ");
356   LLVM_DEBUG(dbgs() << "\n");
357 }
358 
359 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
360   assert(!Values.empty() && "Need some operands!");
361 
362   // If we already visited this instruction bundle, re-use the existing node
363   auto I = BundleToCombined.find(to_vector<4>(Values));
364   if (I != BundleToCombined.end()) {
365 #ifdef NDEBUG
366     // Check that the resulting graph is a tree. If we re-use a node, this means
367     // its values have multiple users. We only allow this, if all users of each
368     // value are the same instruction.
369     for (auto *V : Values) {
370       auto UI = V->user_begin();
371       auto *FirstUser = *UI++;
372       while (UI != V->use_end()) {
373         assert(*UI == FirstUser && "Currently we only support SLP trees.");
374         UI++;
375       }
376     }
377 #endif
378     return I->second;
379   }
380 
381   // Dump inputs
382   LLVM_DEBUG({
383     dbgs() << "buildGraph: ";
384     dumpBundle(Values);
385   });
386 
387   if (!areVectorizable(Values))
388     return markFailed();
389 
390   assert(getOpcode(Values) && "Opcodes for all values must match");
391   unsigned ValuesOpcode = getOpcode(Values).getValue();
392 
393   SmallVector<VPValue *, 4> CombinedOperands;
394   if (areCommutative(Values)) {
395     bool MultiNodeRoot = !MultiNodeActive;
396     MultiNodeActive = true;
397     for (auto &Operands : getOperands(Values)) {
398       LLVM_DEBUG({
399         dbgs() << "  Visiting Commutative";
400         dumpBundle(Operands);
401       });
402 
403       auto OperandsOpcode = getOpcode(Operands);
404       if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
405         LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
406         CombinedOperands.push_back(buildGraph(Operands));
407       } else {
408         LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
409         // Create dummy VPInstruction, which will we replace later by the
410         // re-ordered operand.
411         VPInstruction *Op = new VPInstruction(0, {});
412         CombinedOperands.push_back(Op);
413         MultiNodeOps.emplace_back(Op, Operands);
414       }
415     }
416 
417     if (MultiNodeRoot) {
418       LLVM_DEBUG(dbgs() << "Reorder \n");
419       MultiNodeActive = false;
420 
421       auto FinalOrder = reorderMultiNodeOps();
422 
423       MultiNodeOps.clear();
424       for (auto &Ops : FinalOrder) {
425         VPInstruction *NewOp = buildGraph(Ops.second);
426         Ops.first->replaceAllUsesWith(NewOp);
427         for (unsigned i = 0; i < CombinedOperands.size(); i++)
428           if (CombinedOperands[i] == Ops.first)
429             CombinedOperands[i] = NewOp;
430         delete Ops.first;
431         Ops.first = NewOp;
432       }
433       LLVM_DEBUG(dbgs() << "Found final order\n");
434     }
435   } else {
436     LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
437     if (ValuesOpcode == Instruction::Load)
438       for (VPValue *V : Values)
439         CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
440     else
441       for (auto &Operands : getOperands(Values))
442         CombinedOperands.push_back(buildGraph(Operands));
443   }
444 
445   unsigned Opcode;
446   switch (ValuesOpcode) {
447   case Instruction::Load:
448     Opcode = VPInstruction::SLPLoad;
449     break;
450   case Instruction::Store:
451     Opcode = VPInstruction::SLPStore;
452     break;
453   default:
454     Opcode = ValuesOpcode;
455     break;
456   }
457 
458   if (!CompletelySLP)
459     return markFailed();
460 
461   assert(CombinedOperands.size() > 0 && "Need more some operands");
462   auto *VPI = new VPInstruction(Opcode, CombinedOperands);
463   VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
464 
465   LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
466              cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
467   addCombined(Values, VPI);
468   return VPI;
469 }
470