//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

#include <memory>

namespace llvm {
namespace mca {

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
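
  // Illustrative sketch only (not part of this interface): a dispatch stage is
  // expected to pair the two methods above roughly as follows. `LSU` names an
  // LSUnitBase implementation, `IR` a memory instruction being dispatched, and
  // the token setter on class Instruction is assumed here to be named
  // setLSUTokenID.
  //
  //   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     IR.getInstruction()->setLSUTokenID(TokenID);
  //   }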

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  virtual bool isReady(const InstRef &IR) const = 0;

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  virtual bool isPending(const InstRef &IR) const = 0;

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  virtual bool isWaiting(const InstRef &IR) const = 0;
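
  // Illustrative sketch only: clients (for example, a scheduler) are expected
  // to treat the three predicates above as mutually exclusive wait states of a
  // dispatched memory operation. `LSU` and `IR` are assumed names.
  //
  //   if (LSU.isReady(IR)) {
  //     // All memory dependencies are resolved; IR can be issued.
  //   } else if (LSU.isPending(IR)) {
  //     // Dependencies are executing; the remaining wait time is known.
  //   } else if (LSU.isWaiting(IR)) {
  //     // Dependencies have not started executing; wait time is unknown.
  //   }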

  virtual bool hasDependentUsers(const InstRef &IR) const = 0;

  virtual const CriticalDependency getCriticalPredecessor(unsigned GroupId) = 0;

  virtual void onInstructionExecuted(const InstRef &IR) = 0;

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR) = 0;

  virtual void onInstructionIssued(const InstRef &IR) = 0;

  virtual void cycleEvent() = 0;

#ifndef NDEBUG
  virtual void dump() const = 0;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// for providing different sets of rules for loads and stores by overriding
/// method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about the
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that `mayStore` and have `unmodeled side effects`
/// are treated like store barriers. A full memory barrier is a 'mayLoad' and
/// 'mayStore' instruction with unmodeled side effects. This is obviously
/// inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction with both 'mayStore' and 'HasUnmodeledSideEffects' set is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction with both 'mayLoad' and 'HasUnmodeledSideEffects' set is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
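
  // Illustrative sketch only: the conservative barrier classification
  // described above reduces to checks of the following shape, where MayLoad,
  // MayStore and HasSideEffects stand for the corresponding MCInstrDesc-derived
  // flags (placeholder names, not the exact fields used by the implementation):
  //
  //   bool IsLoadBarrier  = MayLoad  && HasSideEffects;
  //   bool IsStoreBarrier = MayStore && HasSideEffects;
  //   bool IsMemBarrier   = MayLoad  && MayStore && HasSideEffects;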

protected:
  /// A node of a memory dependency graph. A MemoryGroup describes a set of
  /// instructions with the same memory dependencies.
  ///
  /// By construction, instructions of a MemoryGroup don't depend on each other.
  /// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
  /// A memory group identifier is then stored as a "token" in field
  /// Instruction::LSUTokenID of each dispatched instruction. That token is
  /// used internally by the LSUnit to track memory dependencies.
  class MemoryGroup {
    unsigned NumPredecessors = 0;
    unsigned NumExecutingPredecessors = 0;
    unsigned NumExecutedPredecessors = 0;

    unsigned NumInstructions = 0;
    unsigned NumExecuting = 0;
    unsigned NumExecuted = 0;
    // Successors that are in an order dependency with this group.
    SmallVector<MemoryGroup *, 4> OrderSucc;
    // Successors that are in a data dependency with this group.
    SmallVector<MemoryGroup *, 4> DataSucc;

    CriticalDependency CriticalPredecessor;
    InstRef CriticalMemoryInstruction;

    MemoryGroup(const MemoryGroup &) = delete;
    MemoryGroup &operator=(const MemoryGroup &) = delete;

  public:
    MemoryGroup() = default;
    MemoryGroup(MemoryGroup &&) = default;

    size_t getNumSuccessors() const {
      return OrderSucc.size() + DataSucc.size();
    }
    unsigned getNumPredecessors() const { return NumPredecessors; }
    unsigned getNumExecutingPredecessors() const {
      return NumExecutingPredecessors;
    }
    unsigned getNumExecutedPredecessors() const {
      return NumExecutedPredecessors;
    }
    unsigned getNumInstructions() const { return NumInstructions; }
    unsigned getNumExecuting() const { return NumExecuting; }
    unsigned getNumExecuted() const { return NumExecuted; }

    const InstRef &getCriticalMemoryInstruction() const {
      return CriticalMemoryInstruction;
    }
    const CriticalDependency &getCriticalPredecessor() const {
      return CriticalPredecessor;
    }

    void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
      // There is no need to add an order dependency if all instructions from
      // this group have already been issued.
      if (!IsDataDependent && isExecuting())
        return;

      Group->NumPredecessors++;
      assert(!isExecuted() && "Should have been removed!");
      if (isExecuting())
        Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

      if (IsDataDependent)
        DataSucc.emplace_back(Group);
      else
        OrderSucc.emplace_back(Group);
    }

    bool isWaiting() const {
      return NumPredecessors >
             (NumExecutingPredecessors + NumExecutedPredecessors);
    }
    bool isPending() const {
      return NumExecutingPredecessors &&
             ((NumExecutedPredecessors + NumExecutingPredecessors) ==
              NumPredecessors);
    }
    bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
    bool isExecuting() const {
      return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
    }
    bool isExecuted() const { return NumInstructions == NumExecuted; }

    void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
      assert(!isReady() && "Unexpected group-start event!");
      NumExecutingPredecessors++;

      if (!ShouldUpdateCriticalDep)
        return;

      unsigned Cycles = IR.getInstruction()->getCyclesLeft();
      if (CriticalPredecessor.Cycles < Cycles) {
        CriticalPredecessor.IID = IR.getSourceIndex();
        CriticalPredecessor.Cycles = Cycles;
      }
    }

    void onGroupExecuted() {
      assert(!isReady() && "Inconsistent state found!");
      NumExecutingPredecessors--;
      NumExecutedPredecessors++;
    }

    void onInstructionIssued(const InstRef &IR) {
      assert(!isExecuting() && "Invalid internal state!");
      ++NumExecuting;

      // Update the critical memory dependency.
      const Instruction &IS = *IR.getInstruction();
      if ((bool)CriticalMemoryInstruction) {
        const Instruction &OtherIS =
            *CriticalMemoryInstruction.getInstruction();
        if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
          CriticalMemoryInstruction = IR;
      } else {
        CriticalMemoryInstruction = IR;
      }

      if (!isExecuting())
        return;

      // Notify successors that this group started execution.
      for (MemoryGroup *MG : OrderSucc) {
        MG->onGroupIssued(CriticalMemoryInstruction, false);
        // Release the order dependency with this group.
        MG->onGroupExecuted();
      }

      for (MemoryGroup *MG : DataSucc)
        MG->onGroupIssued(CriticalMemoryInstruction, true);
    }

    void onInstructionExecuted(const InstRef &IR) {
      assert(isReady() && !isExecuted() && "Invalid internal state!");
      --NumExecuting;
      ++NumExecuted;

      if (CriticalMemoryInstruction &&
          CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
        CriticalMemoryInstruction.invalidate();
      }

      if (!isExecuted())
        return;

      // Notify data dependent successors that this group has finished
      // execution.
      for (MemoryGroup *MG : DataSucc)
        MG->onGroupExecuted();
    }

    void addInstruction() {
      assert(!getNumSuccessors() && "Cannot add instructions to this group!");
      ++NumInstructions;
    }

    void cycleEvent() {
      if (isWaiting() && CriticalPredecessor.Cycles)
        CriticalPredecessor.Cycles--;
    }
  };
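
  // Illustrative sketch only (the actual policy lives in LSUnit::dispatch): a
  // dispatch event can encode "a younger store may not pass an older store" by
  // chaining memory groups. With IsDataDependent set to false the edge is an
  // order dependency, released when the predecessor group starts executing;
  // with IsDataDependent set to true it is a data dependency, released when
  // the predecessor group has fully executed.
  //
  //   unsigned NewGID = createMemoryGroup();
  //   MemoryGroup &NewGroup = getGroup(NewGID);
  //   NewGroup.addInstruction();
  //   if (CurrentStoreGroupID)
  //     getGroup(CurrentStoreGroupID).addSuccessor(&NewGroup,
  //                                                /*IsDataDependent=*/false);
  //   CurrentStoreGroupID = NewGID;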

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID = 1;

  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}
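
  // Illustrative sketch only: how a tool might instantiate this unit. The
  // queue sizes and the MCSubtargetInfo variable `STI` are placeholders; when
  // both sizes are zero, the queues are modeled as unbounded.
  //
  //   const MCSchedModel &SM = STI.getSchedModel();
  //   LSUnit LSU(SM, /*LQ=*/36, /*SQ=*/24, /*AssumeNoAlias=*/true);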

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  bool isReady(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  bool isPending(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  bool isWaiting(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const CriticalDependency getCriticalPredecessor(unsigned GroupId) override {
    const MemoryGroup &Group = getGroup(GroupId);
    return Group.getCriticalPredecessor();
  }

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;
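
  // Illustrative example of the default rules above on a short sequence, with
  // 'NoAlias' assumed to be false (group numbers are arbitrary):
  //
  //   ld1  -> group 1                (no older stores: ready at dispatch)
  //   ld2  -> group 1                (rule 3: a load may pass older loads)
  //   st1  -> group 2, depends on 1  (rule 4: a store may not pass a load)
  //   ld3  -> group 3, depends on 2  (rule 2: the load waits for the store)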

  virtual void onInstructionIssued(const InstRef &IR) override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void onInstructionRetired(const InstRef &IR) override;

  virtual void onInstructionExecuted(const InstRef &IR) override;

  virtual void cycleEvent() override;

#ifndef NDEBUG
  virtual void dump() const override;
#endif

private:
  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H