//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
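  // Helpers used to reserve or free a single entry in the load queue (LQ) and
  // in the store queue (SQ).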
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  virtual bool isReady(const InstRef &IR) const = 0;

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  virtual bool isPending(const InstRef &IR) const = 0;

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  virtual bool isWaiting(const InstRef &IR) const = 0;

  virtual bool hasDependentUsers(const InstRef &IR) const = 0;

  virtual const CriticalDependency getCriticalPredecessor(unsigned GroupId) = 0;

  virtual void onInstructionExecuted(const InstRef &IR) = 0;

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR) = 0;

  virtual void onInstructionIssued(const InstRef &IR) = 0;

  virtual void cycleEvent() = 0;

#ifndef NDEBUG
  virtual void dump() const = 0;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores or barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
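/// For example, given the following hypothetical sequence (oldest first):
///
///   I1: store to A
///   I2: load from B
///   I3: store to C
///
/// Rule 2 prevents I3 from passing I1, and rule 3 prevents I3 from passing I2.
/// Rule 4 allows I2 to pass I1 only if the two accesses don't alias, which is
/// what this class optimistically assumes by default.
///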
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` when constructing the LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations or
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but it is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
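///
/// A minimal construction sketch (the queue sizes below are hypothetical; a
/// size of zero means that the corresponding queue is unbounded):
///
///   LSUnit LSU(SM, /* LQSize */ 16, /* SQSize */ 8, /* NoAlias */ false);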
class LSUnit : public LSUnitBase {

  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that is both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that is both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.

protected:
  /// A node of a memory dependency graph. A MemoryGroup describes a set of
  /// instructions with the same memory dependencies.
  ///
  /// By construction, instructions of a MemoryGroup don't depend on each other.
  /// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
  /// A memory group identifier is then stored as a "token" in field
  /// Instruction::LSUTokenID of each dispatched instruction. That token is
  /// used internally by the LSUnit to track memory dependencies.
  class MemoryGroup {
    unsigned NumPredecessors = 0;
    unsigned NumExecutingPredecessors = 0;
    unsigned NumExecutedPredecessors = 0;

    unsigned NumInstructions = 0;
    unsigned NumExecuting = 0;
    unsigned NumExecuted = 0;
    // Successors that are in an order dependency with this group.
    SmallVector<MemoryGroup *, 4> OrderSucc;
    // Successors that are in a data dependency with this group.
    SmallVector<MemoryGroup *, 4> DataSucc;

    CriticalDependency CriticalPredecessor;
    InstRef CriticalMemoryInstruction;

    MemoryGroup(const MemoryGroup &) = delete;
    MemoryGroup &operator=(const MemoryGroup &) = delete;

  public:
    MemoryGroup() = default;
    MemoryGroup(MemoryGroup &&) = default;

    size_t getNumSuccessors() const {
      return OrderSucc.size() + DataSucc.size();
    }
    unsigned getNumPredecessors() const { return NumPredecessors; }
    unsigned getNumExecutingPredecessors() const {
      return NumExecutingPredecessors;
    }
    unsigned getNumExecutedPredecessors() const {
      return NumExecutedPredecessors;
    }
    unsigned getNumInstructions() const { return NumInstructions; }
    unsigned getNumExecuting() const { return NumExecuting; }
    unsigned getNumExecuted() const { return NumExecuted; }

    const InstRef &getCriticalMemoryInstruction() const {
      return CriticalMemoryInstruction;
    }
    const CriticalDependency &getCriticalPredecessor() const {
      return CriticalPredecessor;
    }

    void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
      // There is no need to add an order dependency if all the instructions
      // from this group have already been issued.
      if (!IsDataDependent && isExecuting())
        return;

      Group->NumPredecessors++;
      assert(!isExecuted() && "Should have been removed!");
      if (isExecuting())
        Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

      if (IsDataDependent)
        DataSucc.emplace_back(Group);
      else
        OrderSucc.emplace_back(Group);
    }

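    // Dependency state of this group with respect to its predecessors:
    //  - waiting:   at least one predecessor group has not started executing;
    //  - pending:   every predecessor is at least executing, and at least one
    //               of them has not finished executing yet;
    //  - ready:     every predecessor has fully executed.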
    bool isWaiting() const {
      return NumPredecessors >
             (NumExecutingPredecessors + NumExecutedPredecessors);
    }
    bool isPending() const {
      return NumExecutingPredecessors &&
             ((NumExecutedPredecessors + NumExecutingPredecessors) ==
              NumPredecessors);
    }
    bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
    bool isExecuting() const {
      return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
    }
    bool isExecuted() const { return NumInstructions == NumExecuted; }

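    // Notifies this group that one of its predecessor groups has started
    // executing. IR is the critical memory instruction of that predecessor;
    // it is only used to update the critical predecessor information when
    // ShouldUpdateCriticalDep is true.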
    void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
      assert(!isReady() && "Unexpected group-start event!");
      NumExecutingPredecessors++;

      if (!ShouldUpdateCriticalDep)
        return;

      unsigned Cycles = IR.getInstruction()->getCyclesLeft();
      if (CriticalPredecessor.Cycles < Cycles) {
        CriticalPredecessor.IID = IR.getSourceIndex();
        CriticalPredecessor.Cycles = Cycles;
      }
    }

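    // Notifies this group that one of its predecessor groups has fully
    // executed, resolving one more predecessor dependency.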
    void onGroupExecuted() {
      assert(!isReady() && "Inconsistent state found!");
      NumExecutingPredecessors--;
      NumExecutedPredecessors++;
    }

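    // Notifies this group that one of its instructions has been issued for
    // execution.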
    void onInstructionIssued(const InstRef &IR) {
      assert(!isExecuting() && "Invalid internal state!");
      ++NumExecuting;

      // Update the critical memory instruction for this group.
      const Instruction &IS = *IR.getInstruction();
      if ((bool)CriticalMemoryInstruction) {
        const Instruction &OtherIS =
            *CriticalMemoryInstruction.getInstruction();
        if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
          CriticalMemoryInstruction = IR;
      } else {
        CriticalMemoryInstruction = IR;
      }

      if (!isExecuting())
        return;

      // Notify successors that this group started execution.
      for (MemoryGroup *MG : OrderSucc) {
        MG->onGroupIssued(CriticalMemoryInstruction, false);
        // Release the order dependency with this group.
        MG->onGroupExecuted();
      }

      for (MemoryGroup *MG : DataSucc)
        MG->onGroupIssued(CriticalMemoryInstruction, true);
    }

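    // Notifies this group that one of its instructions has completed
    // execution.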
    void onInstructionExecuted(const InstRef &IR) {
      assert(isReady() && !isExecuted() && "Invalid internal state!");
      --NumExecuting;
      ++NumExecuted;

      if (CriticalMemoryInstruction &&
          CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
        CriticalMemoryInstruction.invalidate();
      }

      if (!isExecuted())
        return;

      // Notify data dependent successors that this group has finished
      // execution.
      for (MemoryGroup *MG : DataSucc)
        MG->onGroupExecuted();
    }

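    // Adds a new instruction to this group. This is only allowed while the
    // group has no successors.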
    void addInstruction() {
      assert(!getNumSuccessors() && "Cannot add instructions to this group!");
      ++NumInstructions;
    }

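    // Per-cycle bookkeeping: while the group is still waiting, decrement the
    // cycle counter of its critical predecessor.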
    void cycleEvent() {
      if (isWaiting() && CriticalPredecessor.Cycles)
        CriticalPredecessor.Cycles--;
    }
  };

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID = 1;

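  // Group identifiers of the most recently created load, load-barrier, store,
  // and store-barrier groups. A value of zero means that no such group is
  // currently being tracked.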
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  bool isReady(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  bool isPending(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  bool isWaiting(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const CriticalDependency getCriticalPredecessor(unsigned GroupId) override {
    const MemoryGroup &Group = getGroup(GroupId);
    return Group.getCriticalPredecessor();
  }

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// and returned LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  virtual void onInstructionIssued(const InstRef &IR) override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void onInstructionRetired(const InstRef &IR) override;

  virtual void onInstructionExecuted(const InstRef &IR) override;

  virtual void cycleEvent() override;

#ifndef NDEBUG
  virtual void dump() const override;
#endif

private:
  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

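  // Creates a new memory group and returns its unique identifier.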
  unsigned createMemoryGroup() {
    Groups.insert(std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H