//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to avoid breaking
// instructions such as the x87 ones.

// Ideally, the only limitation on executing instructions would be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, rather than the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from a GPR and
// writes to AX. We just need to ensure that the Var is assigned a register
// different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will fully saturate the ports, while the same snippet
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read from and write
// to the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This creates a data dependency from each instruction to the next, limiting
// the number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try to
// avoid P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write the same operand; for example, ADC16rr both
// reads and writes EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

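// Returns the variables of Instr that have tied operands, i.e. variables
// whose assigned register is both read and written by the instruction.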
static SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

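// Points every memory operand in Instructions at a distinct offset inside the
// scratch space (ScratchSpacePointerInReg + I * MemStep) so that the
// instructions access different addresses.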
void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

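  // If the snippet is too short, pad it with copies of the original
  // instructions so that at least kMinNumDifferentAddresses different
  // addresses are accessed.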
  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

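// Generates as many independent copies of IT as possible by assigning each
// tied variable a register from its admissible set in a round-robin fashion
// and removing that register from every variable's set. Generation stops as
// soon as one of the variables runs out of candidate registers.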
static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
    const LLVMState &State, const InstructionTemplate &IT,
    const ArrayRef<const Variable *> TiedVariables,
    const BitVector &ForbiddenRegisters) {
  std::vector<InstructionTemplate> Instructions;
  // Assign registers to variables in a round-robin manner. This is simple but
  // ensures that the most register-constrained variable does not get starved.
  std::vector<BitVector> PossibleRegsForVar;
  for (const Variable *Var : TiedVariables) {
    assert(Var);
    const Operand &Op = IT.getInstr().getPrimaryOperand(*Var);
    assert(Op.isReg());
    BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits();
    remove(PossibleRegs, ForbiddenRegisters);
    PossibleRegsForVar.push_back(std::move(PossibleRegs));
  }
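  // Iterators[VarId] holds the last register assigned to variable VarId
  // (0 before the first assignment); find_next() resumes strictly after it.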
  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
  while (true) {
    InstructionTemplate TmpIT = IT;
    // Find a possible register for each variable in turn, marking the
    // register as taken.
    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
      const int NextPossibleReg =
          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
      if (NextPossibleReg <= 0) {
        return Instructions;
      }
      TmpIT.getValueFor(*TiedVariables[VarId]) =
          MCOperand::createReg(NextPossibleReg);
      // Bump iterator.
      Iterators[VarId] = NextPossibleReg;
      // Prevent other variables from using the register.
      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
        OtherPossibleRegs.reset(NextPossibleReg);
      }
    }
    Instructions.push_back(std::move(TmpIT));
  }
}

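// Builds the single code template for Instr:
//  - if the instruction does not alias with itself, a single copy is enough
//    (its repetitions can execute in parallel);
//  - if the self-aliasing is implicit (e.g. through EFLAGS), it cannot be
//    broken by renaming, so a single copy is used as well;
//  - if the aliasing goes through tied variables, they are renamed statically
//    (see generateSnippetUsingStaticRenaming above);
//  - otherwise, defs are assigned random registers and uses are assigned
//    random registers that do not alias with the defs.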
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    const Instruction &Instr, const BitVector &ForbiddenRegisters) const {
  CodeTemplate CT;
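  // If the instruction accesses memory, ask the target which register should
  // point into the scratch space; otherwise no scratch register is needed.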
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(&Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    CT.Info = "instruction has tied variables, using static renaming.";
    CT.Instructions = generateSnippetUsingStaticRenaming(
        State, IT, TiedVariables, ForbiddenRegisters);
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  // No tied variables; pick random values for defs.
  BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      // Do not use forbidden registers.
      remove(PossibleRegisters, ForbiddenRegisters);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and do not alias with
  // the defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ForbiddenRegisters);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables, picking Uses different from defs";
  CT.Instructions.push_back(std::move(IT));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}

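// Out-of-line definition of the static constexpr member; required for
// ODR-use before C++17.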
constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm