1 //===-- Target.cpp ----------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "../Target.h"
9 
10 #include "../Error.h"
11 #include "../MmapUtils.h"
12 #include "../ParallelSnippetGenerator.h"
13 #include "../SerialSnippetGenerator.h"
14 #include "../SnippetGenerator.h"
15 #include "../SubprocessMemory.h"
16 #include "MCTargetDesc/X86BaseInfo.h"
17 #include "MCTargetDesc/X86MCTargetDesc.h"
18 #include "X86.h"
19 #include "X86Counter.h"
20 #include "X86RegisterInfo.h"
21 #include "llvm/ADT/Sequence.h"
22 #include "llvm/CodeGen/MachineInstrBuilder.h"
23 #include "llvm/MC/MCInstBuilder.h"
24 #include "llvm/Support/Errc.h"
25 #include "llvm/Support/Error.h"
26 #include "llvm/Support/ErrorHandling.h"
27 #include "llvm/Support/FormatVariadic.h"
28 #include "llvm/TargetParser/Host.h"
29 
30 #include <memory>
31 #include <string>
32 #include <vector>
33 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
34 #include <immintrin.h>
35 #include <intrin.h>
36 #endif
37 #if defined(_MSC_VER) && defined(_M_X64)
38 #include <float.h> // For _clearfp in ~X86SavedState().
39 #endif
40 
41 #ifdef __linux__
42 #ifdef __x86_64__
43 #include <asm/prctl.h>
44 #endif // __x86_64__
45 #include <sys/mman.h>
46 #include <sys/syscall.h>
47 #include <unistd.h>
48 #ifdef HAVE_LIBPFM
49 #include <perfmon/perf_event.h>
50 #endif // HAVE_LIBPFM
51 #endif
52 
53 #define GET_AVAILABLE_OPCODE_CHECKER
54 #include "X86GenInstrInfo.inc"
55 
56 namespace llvm {
57 namespace exegesis {
58 
59 // If a positive value is specified, we are going to use the LBR in
60 // latency-mode.
61 //
62 // Note:
63 //  -  A small value is preferred, but too low a value could result in
64 //     throttling.
65 //  -  A prime number is preferred to avoid always skipping certain blocks.
66 //
67 static cl::opt<unsigned> LbrSamplingPeriod(
68     "x86-lbr-sample-period",
69     cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
70     cl::cat(BenchmarkOptions), cl::init(0));
71 
72 static cl::opt<bool>
73     DisableUpperSSERegisters("x86-disable-upper-sse-registers",
74                              cl::desc("Disable XMM8-XMM15 register usage"),
75                              cl::cat(BenchmarkOptions), cl::init(false));
76 
77 // FIXME: Validates that repetition-mode is loop if LBR is requested.
78 
79 // Returns a non-null reason if we cannot handle the memory references in this
80 // instruction.
81 static const char *isInvalidMemoryInstr(const Instruction &Instr) {
82   switch (Instr.Description.TSFlags & X86II::FormMask) {
83   default:
84     return "Unknown FormMask value";
85   // These have no memory access.
86   case X86II::Pseudo:
87   case X86II::RawFrm:
88   case X86II::AddCCFrm:
89   case X86II::PrefixByte:
90   case X86II::MRMDestReg:
91   case X86II::MRMSrcReg:
92   case X86II::MRMSrcReg4VOp3:
93   case X86II::MRMSrcRegOp4:
94   case X86II::MRMSrcRegCC:
95   case X86II::MRMXrCC:
96   case X86II::MRMr0:
97   case X86II::MRMXr:
98   case X86II::MRM0r:
99   case X86II::MRM1r:
100   case X86II::MRM2r:
101   case X86II::MRM3r:
102   case X86II::MRM4r:
103   case X86II::MRM5r:
104   case X86II::MRM6r:
105   case X86II::MRM7r:
106   case X86II::MRM0X:
107   case X86II::MRM1X:
108   case X86II::MRM2X:
109   case X86II::MRM3X:
110   case X86II::MRM4X:
111   case X86II::MRM5X:
112   case X86II::MRM6X:
113   case X86II::MRM7X:
114   case X86II::MRM_C0:
115   case X86II::MRM_C1:
116   case X86II::MRM_C2:
117   case X86II::MRM_C3:
118   case X86II::MRM_C4:
119   case X86II::MRM_C5:
120   case X86II::MRM_C6:
121   case X86II::MRM_C7:
122   case X86II::MRM_C8:
123   case X86II::MRM_C9:
124   case X86II::MRM_CA:
125   case X86II::MRM_CB:
126   case X86II::MRM_CC:
127   case X86II::MRM_CD:
128   case X86II::MRM_CE:
129   case X86II::MRM_CF:
130   case X86II::MRM_D0:
131   case X86II::MRM_D1:
132   case X86II::MRM_D2:
133   case X86II::MRM_D3:
134   case X86II::MRM_D4:
135   case X86II::MRM_D5:
136   case X86II::MRM_D6:
137   case X86II::MRM_D7:
138   case X86II::MRM_D8:
139   case X86II::MRM_D9:
140   case X86II::MRM_DA:
141   case X86II::MRM_DB:
142   case X86II::MRM_DC:
143   case X86II::MRM_DD:
144   case X86II::MRM_DE:
145   case X86II::MRM_DF:
146   case X86II::MRM_E0:
147   case X86II::MRM_E1:
148   case X86II::MRM_E2:
149   case X86II::MRM_E3:
150   case X86II::MRM_E4:
151   case X86II::MRM_E5:
152   case X86II::MRM_E6:
153   case X86II::MRM_E7:
154   case X86II::MRM_E8:
155   case X86II::MRM_E9:
156   case X86II::MRM_EA:
157   case X86II::MRM_EB:
158   case X86II::MRM_EC:
159   case X86II::MRM_ED:
160   case X86II::MRM_EE:
161   case X86II::MRM_EF:
162   case X86II::MRM_F0:
163   case X86II::MRM_F1:
164   case X86II::MRM_F2:
165   case X86II::MRM_F3:
166   case X86II::MRM_F4:
167   case X86II::MRM_F5:
168   case X86II::MRM_F6:
169   case X86II::MRM_F7:
170   case X86II::MRM_F8:
171   case X86II::MRM_F9:
172   case X86II::MRM_FA:
173   case X86II::MRM_FB:
174   case X86II::MRM_FC:
175   case X86II::MRM_FD:
176   case X86II::MRM_FE:
177   case X86II::MRM_FF:
178   case X86II::RawFrmImm8:
179     return nullptr;
180   case X86II::AddRegFrm:
181     return (Instr.Description.Opcode == X86::POP16r ||
182             Instr.Description.Opcode == X86::POP32r ||
183             Instr.Description.Opcode == X86::PUSH16r ||
184             Instr.Description.Opcode == X86::PUSH32r)
185                ? "unsupported opcode: unsupported memory access"
186                : nullptr;
187   // These access memory and are handled.
188   case X86II::MRMDestMem:
189   case X86II::MRMSrcMem:
190   case X86II::MRMSrcMem4VOp3:
191   case X86II::MRMSrcMemOp4:
192   case X86II::MRMSrcMemCC:
193   case X86II::MRMXmCC:
194   case X86II::MRMXm:
195   case X86II::MRM0m:
196   case X86II::MRM1m:
197   case X86II::MRM2m:
198   case X86II::MRM3m:
199   case X86II::MRM4m:
200   case X86II::MRM5m:
201   case X86II::MRM6m:
202   case X86II::MRM7m:
203     return nullptr;
204   // These access memory and are not handled yet.
205   case X86II::RawFrmImm16:
206   case X86II::RawFrmMemOffs:
207   case X86II::RawFrmSrc:
208   case X86II::RawFrmDst:
209   case X86II::RawFrmDstSrc:
210     return "unsupported opcode: non uniform memory access";
211   }
212 }
213 
214 // If the opcode is invalid, returns a pointer to a string literal indicating
215 // the reason. nullptr indicates a valid opcode.
216 static const char *isInvalidOpcode(const Instruction &Instr) {
217   const auto OpcodeName = Instr.Name;
218   if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
219     return "unsupported opcode: pseudo instruction";
220   if ((OpcodeName.starts_with("POP") && !OpcodeName.starts_with("POPCNT")) ||
221       OpcodeName.starts_with("PUSH") ||
222       OpcodeName.starts_with("ADJCALLSTACK") || OpcodeName.starts_with("LEAVE"))
223     return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
224   switch (Instr.Description.Opcode) {
225   case X86::LFS16rm:
226   case X86::LFS32rm:
227   case X86::LFS64rm:
228   case X86::LGS16rm:
229   case X86::LGS32rm:
230   case X86::LGS64rm:
231   case X86::LSS16rm:
232   case X86::LSS32rm:
233   case X86::LSS64rm:
234   case X86::SYSENTER:
235   case X86::WRFSBASE:
236   case X86::WRFSBASE64:
237     return "unsupported opcode";
238   default:
239     break;
240   }
241   if (const auto reason = isInvalidMemoryInstr(Instr))
242     return reason;
243   // We do not handle instructions with OPERAND_PCREL.
244   for (const Operand &Op : Instr.Operands)
245     if (Op.isExplicit() &&
246         Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
247       return "unsupported opcode: PC relative operand";
248   // We do not handle second-form X87 instructions. We only handle first-form
249   // ones (_Fp); see the comment in X86InstrFPStack.td.
250   for (const Operand &Op : Instr.Operands)
251     if (Op.isReg() && Op.isExplicit() &&
252         Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
253       return "unsupported second-form X87 instruction";
254   return nullptr;
255 }
256 
257 static unsigned getX86FPFlags(const Instruction &Instr) {
258   return Instr.Description.TSFlags & X86II::FPTypeMask;
259 }
260 
261 // Helper to fill a memory operand with a value.
262 static void setMemOp(InstructionTemplate &IT, int OpIdx,
263                      const MCOperand &OpVal) {
264   const auto Op = IT.getInstr().Operands[OpIdx];
265   assert(Op.isExplicit() && "invalid memory pattern");
266   IT.getValueFor(Op) = OpVal;
267 }
268 
269 // Common (latency, uops) code for LEA templates. `RestrictDestRegs` takes the
270 // addressing base and index registers and restricts the candidate destinations.
271 static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
272     const Instruction &Instr, const BitVector &ForbiddenRegisters,
273     const LLVMState &State, const SnippetGenerator::Options &Opts,
274     std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
275         RestrictDestRegs) {
276   assert(Instr.Operands.size() == 6 && "invalid LEA");
277   assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
278          "invalid LEA");
279 
280   constexpr const int kDestOp = 0;
281   constexpr const int kBaseOp = 1;
282   constexpr const int kIndexOp = 3;
283   auto PossibleDestRegs =
284       Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
285   remove(PossibleDestRegs, ForbiddenRegisters);
286   auto PossibleBaseRegs =
287       Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
288   remove(PossibleBaseRegs, ForbiddenRegisters);
289   auto PossibleIndexRegs =
290       Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
291   remove(PossibleIndexRegs, ForbiddenRegisters);
292 
293   const auto &RegInfo = State.getRegInfo();
294   std::vector<CodeTemplate> Result;
295   for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
296     for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
297       for (int LogScale = 0; LogScale <= 3; ++LogScale) {
298         // FIXME: Add an option for controlling how we explore immediates.
299         for (const int Disp : {0, 42}) {
300           InstructionTemplate IT(&Instr);
301           const int64_t Scale = 1ull << LogScale;
302           setMemOp(IT, 1, MCOperand::createReg(BaseReg));
303           setMemOp(IT, 2, MCOperand::createImm(Scale));
304           setMemOp(IT, 3, MCOperand::createReg(IndexReg));
305           setMemOp(IT, 4, MCOperand::createImm(Disp));
306           // SegmentReg must be 0 for LEA.
307           setMemOp(IT, 5, MCOperand::createReg(0));
308 
309           // Output reg candidates are selected by the caller.
310           auto PossibleDestRegsNow = PossibleDestRegs;
311           RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
312           assert(PossibleDestRegsNow.set_bits().begin() !=
313                      PossibleDestRegsNow.set_bits().end() &&
314                  "no remaining registers");
315           setMemOp(
316               IT, 0,
317               MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));
318 
319           CodeTemplate CT;
320           CT.Instructions.push_back(std::move(IT));
321           CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
322                               RegInfo.getName(IndexReg), Scale, Disp)
323                           .str();
324           Result.push_back(std::move(CT));
325           if (Result.size() >= Opts.MaxConfigsPerOpcode)
326             return std::move(Result);
327         }
328       }
329     }
330   }
331 
332   return std::move(Result);
333 }
334 
335 namespace {
336 class X86SerialSnippetGenerator : public SerialSnippetGenerator {
337 public:
338   using SerialSnippetGenerator::SerialSnippetGenerator;
339 
340   Expected<std::vector<CodeTemplate>>
341   generateCodeTemplates(InstructionTemplate Variant,
342                         const BitVector &ForbiddenRegisters) const override;
343 };
344 } // namespace
345 
346 Expected<std::vector<CodeTemplate>>
347 X86SerialSnippetGenerator::generateCodeTemplates(
348     InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
349   const Instruction &Instr = Variant.getInstr();
350 
351   if (const auto reason = isInvalidOpcode(Instr))
352     return make_error<Failure>(reason);
353 
354   // LEA gets special attention.
355   const auto Opcode = Instr.Description.getOpcode();
356   if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
357     return generateLEATemplatesCommon(
358         Instr, ForbiddenRegisters, State, Opts,
359         [this](unsigned BaseReg, unsigned IndexReg,
360                BitVector &CandidateDestRegs) {
361           // We just select a destination register that aliases the base
362           // register.
363           CandidateDestRegs &=
364               State.getRATC().getRegister(BaseReg).aliasedBits();
365         });
366   }
367 
368   if (Instr.hasMemoryOperands())
369     return make_error<Failure>(
370         "unsupported memory operand in latency measurements");
371 
372   switch (getX86FPFlags(Instr)) {
373   case X86II::NotFP:
374     return SerialSnippetGenerator::generateCodeTemplates(Variant,
375                                                          ForbiddenRegisters);
376   case X86II::ZeroArgFP:
377   case X86II::OneArgFP:
378   case X86II::SpecialFP:
379   case X86II::CompareFP:
380   case X86II::CondMovFP:
381     return make_error<Failure>("Unsupported x87 Instruction");
382   case X86II::OneArgFPRW:
383   case X86II::TwoArgFP:
384     // These are instructions like
385     //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
386     //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
387     // They are intrinsically serial and do not modify the state of the stack.
388     return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
389   default:
390     llvm_unreachable("Unknown FP Type!");
391   }
392 }
393 
394 namespace {
395 class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
396 public:
397   using ParallelSnippetGenerator::ParallelSnippetGenerator;
398 
399   Expected<std::vector<CodeTemplate>>
400   generateCodeTemplates(InstructionTemplate Variant,
401                         const BitVector &ForbiddenRegisters) const override;
402 };
403 
404 } // namespace
405 
406 Expected<std::vector<CodeTemplate>>
407 X86ParallelSnippetGenerator::generateCodeTemplates(
408     InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
409   const Instruction &Instr = Variant.getInstr();
410 
411   if (const auto reason = isInvalidOpcode(Instr))
412     return make_error<Failure>(reason);
413 
414   // LEA gets special attention.
415   const auto Opcode = Instr.Description.getOpcode();
416   if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
417     return generateLEATemplatesCommon(
418         Instr, ForbiddenRegisters, State, Opts,
419         [this](unsigned BaseReg, unsigned IndexReg,
420                BitVector &CandidateDestRegs) {
421           // Any destination register that is not used for addressing is fine.
422           remove(CandidateDestRegs,
423                  State.getRATC().getRegister(BaseReg).aliasedBits());
424           remove(CandidateDestRegs,
425                  State.getRATC().getRegister(IndexReg).aliasedBits());
426         });
427   }
428 
429   switch (getX86FPFlags(Instr)) {
430   case X86II::NotFP:
431     return ParallelSnippetGenerator::generateCodeTemplates(Variant,
432                                                            ForbiddenRegisters);
433   case X86II::ZeroArgFP:
434   case X86II::OneArgFP:
435   case X86II::SpecialFP:
436     return make_error<Failure>("Unsupported x87 Instruction");
437   case X86II::OneArgFPRW:
438   case X86II::TwoArgFP:
439     // These are instructions like
440     //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
441     //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
442     // They are intrinsically serial and do not modify the state of the stack.
443     // We generate the same code for latency and uops.
444     return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
445   case X86II::CompareFP:
446   case X86II::CondMovFP:
447     // We can compute uops for any FP instruction that does not grow or shrink
448     // the stack (it either does not touch the stack or pushes as much as it pops).
449     return generateUnconstrainedCodeTemplates(
450         Variant, "instruction does not grow/shrink the FP stack");
451   default:
452     llvm_unreachable("Unknown FP Type!");
453   }
454 }
455 
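// Returns the MOVri opcode that materializes an immediate into a register of
// the given bit width (8/16/32/64 bits).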
456 static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
457   switch (RegBitWidth) {
458   case 8:
459     return X86::MOV8ri;
460   case 16:
461     return X86::MOV16ri;
462   case 32:
463     return X86::MOV32ri;
464   case 64:
465     return X86::MOV64ri;
466   }
467   llvm_unreachable("Invalid Value Width");
468 }
469 
470 // Generates an instruction to load an immediate value into a register.
471 static MCInst loadImmediate(MCRegister Reg, unsigned RegBitWidth,
472                             const APInt &Value) {
473   if (Value.getBitWidth() > RegBitWidth)
474     llvm_unreachable("Value must fit in the Register");
475   return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
476       .addReg(Reg)
477       .addImm(Value.getZExtValue());
478 }
479 
480 // Allocates scratch memory on the stack.
481 static MCInst allocateStackSpace(unsigned Bytes) {
482   return MCInstBuilder(X86::SUB64ri8)
483       .addReg(X86::RSP)
484       .addReg(X86::RSP)
485       .addImm(Bytes);
486 }
487 
488 // Fills scratch memory at offset `OffsetBytes` with value `Imm`.
489 static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
490                              uint64_t Imm) {
491   return MCInstBuilder(MovOpcode)
492       // Address = ESP
493       .addReg(X86::RSP)    // BaseReg
494       .addImm(1)           // ScaleAmt
495       .addReg(0)           // IndexReg
496       .addImm(OffsetBytes) // Disp
497       .addReg(0)           // Segment
498       // Immediate.
499       .addImm(Imm);
500 }
501 
502 // Loads scratch memory into register `Reg` using opcode `RMOpcode`.
503 static MCInst loadToReg(MCRegister Reg, unsigned RMOpcode) {
504   return MCInstBuilder(RMOpcode)
505       .addReg(Reg)
506       // Address = ESP
507       .addReg(X86::RSP) // BaseReg
508       .addImm(1)        // ScaleAmt
509       .addReg(0)        // IndexReg
510       .addImm(0)        // Disp
511       .addReg(0);       // Segment
512 }
513 
514 // Releases scratch memory.
515 static MCInst releaseStackSpace(unsigned Bytes) {
516   return MCInstBuilder(X86::ADD64ri8)
517       .addReg(X86::RSP)
518       .addReg(X86::RSP)
519       .addImm(Bytes);
520 }
521 
522 // Reserves some space on the stack, fills it with the content of the provided
523 // constant, and provides methods to load the stack value into a register.
524 namespace {
525 struct ConstantInliner {
526   explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}
527 
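  // Materializes the constant on the stack, loads it into Reg with the given
  // memory-load Opcode, then releases the stack space.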
528   std::vector<MCInst> loadAndFinalize(MCRegister Reg, unsigned RegBitWidth,
529                                       unsigned Opcode);
530 
531   std::vector<MCInst> loadX87STAndFinalize(MCRegister Reg);
532 
533   std::vector<MCInst> loadX87FPAndFinalize(MCRegister Reg);
534 
535   std::vector<MCInst> popFlagAndFinalize();
536 
537   std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
538                                                  unsigned Value);
539 
540   std::vector<MCInst> loadDirectionFlagAndFinalize();
541 
542 private:
543   ConstantInliner &add(const MCInst &Inst) {
544     Instructions.push_back(Inst);
545     return *this;
546   }
547 
548   void initStack(unsigned Bytes);
549 
550   static constexpr const unsigned kF80Bytes = 10; // 80 bits.
551 
552   APInt Constant_;
553   std::vector<MCInst> Instructions;
554 };
555 } // namespace
556 
557 std::vector<MCInst> ConstantInliner::loadAndFinalize(MCRegister Reg,
558                                                      unsigned RegBitWidth,
559                                                      unsigned Opcode) {
560   assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
561   initStack(RegBitWidth / 8);
562   add(loadToReg(Reg, Opcode));
563   add(releaseStackSpace(RegBitWidth / 8));
564   return std::move(Instructions);
565 }
566 
567 std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(MCRegister Reg) {
568   initStack(kF80Bytes);
569   add(MCInstBuilder(X86::LD_F80m)
570           // Address = ESP
571           .addReg(X86::RSP) // BaseReg
572           .addImm(1)        // ScaleAmt
573           .addReg(0)        // IndexReg
574           .addImm(0)        // Disp
575           .addReg(0));      // Segment
576   if (Reg != X86::ST0)
577     add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
578   add(releaseStackSpace(kF80Bytes));
579   return std::move(Instructions);
580 }
581 
582 std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(MCRegister Reg) {
583   initStack(kF80Bytes);
584   add(MCInstBuilder(X86::LD_Fp80m)
585           .addReg(Reg)
586           // Address = ESP
587           .addReg(X86::RSP) // BaseReg
588           .addImm(1)        // ScaleAmt
589           .addReg(0)        // IndexReg
590           .addImm(0)        // Disp
591           .addReg(0));      // Segment
592   add(releaseStackSpace(kF80Bytes));
593   return std::move(Instructions);
594 }
595 
596 std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
597   initStack(8);
598   add(MCInstBuilder(X86::POPF64));
599   return std::move(Instructions);
600 }
601 
602 std::vector<MCInst>
603 ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
604   add(allocateStackSpace(4));
605   add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
606   add(MCInstBuilder(Opcode)
607           // Address = ESP
608           .addReg(X86::RSP) // BaseReg
609           .addImm(1)        // ScaleAmt
610           .addReg(0)        // IndexReg
611           .addImm(0)        // Disp
612           .addReg(0));      // Segment
613   add(releaseStackSpace(4));
614   return std::move(Instructions);
615 }
616 
617 std::vector<MCInst> ConstantInliner::loadDirectionFlagAndFinalize() {
618   if (Constant_.isZero())
619     add(MCInstBuilder(X86::CLD));
620   else if (Constant_.isOne())
621     add(MCInstBuilder(X86::STD));
622 
623   return std::move(Instructions);
624 }
625 
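// Allocates `Bytes` of stack space and stores the (sign-extended) constant
// there, writing 4-byte chunks first, then a 2-byte and a 1-byte tail as
// needed.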
626 void ConstantInliner::initStack(unsigned Bytes) {
627   assert(Constant_.getBitWidth() <= Bytes * 8 &&
628          "Value does not have the correct size");
629   const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
630                                  ? Constant_.sext(Bytes * 8)
631                                  : Constant_;
632   add(allocateStackSpace(Bytes));
633   size_t ByteOffset = 0;
634   for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
635     add(fillStackSpace(
636         X86::MOV32mi, ByteOffset,
637         WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
638   if (Bytes - ByteOffset >= 2) {
639     add(fillStackSpace(
640         X86::MOV16mi, ByteOffset,
641         WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
642     ByteOffset += 2;
643   }
644   if (Bytes - ByteOffset >= 1)
645     add(fillStackSpace(
646         X86::MOV8mi, ByteOffset,
647         WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
648 }
649 
650 #include "X86GenExegesis.inc"
651 
652 namespace {
653 
654 class X86SavedState : public ExegesisTarget::SavedState {
655 public:
656   X86SavedState() {
657 #if defined(_MSC_VER) && defined(_M_X64)
658     _fxsave64(FPState);
659     Eflags = __readeflags();
660 #elif defined(__GNUC__) && defined(__x86_64__)
661     __builtin_ia32_fxsave64(FPState);
662     Eflags = __builtin_ia32_readeflags_u64();
663 #else
664     report_fatal_error("X86 exegesis running on unsupported target");
665 #endif
666   }
667 
668   ~X86SavedState() {
669     // Restoring the X87 state does not flush pending exceptions, so make sure
670     // these exceptions are flushed now.
671 #if defined(_MSC_VER) && defined(_M_X64)
672     _clearfp();
673     _fxrstor64(FPState);
674     __writeeflags(Eflags);
675 #elif defined(__GNUC__) && defined(__x86_64__)
676     asm volatile("fwait");
677     __builtin_ia32_fxrstor64(FPState);
678     __builtin_ia32_writeeflags_u64(Eflags);
679 #else
680     report_fatal_error("X86 exegesis running on unsupported target");
681 #endif
682   }
683 
684 private:
685 #if defined(__x86_64__) || defined(_M_X64)
686   alignas(16) char FPState[512];
687   uint64_t Eflags;
688 #endif
689 };
690 
691 class ExegesisX86Target : public ExegesisTarget {
692 public:
693   ExegesisX86Target()
694       : ExegesisTarget(X86CpuPfmCounters, X86_MC::isOpcodeAvailable) {}
695 
696   Expected<std::unique_ptr<pfm::CounterGroup>>
697   createCounter(StringRef CounterName, const LLVMState &State,
698                 ArrayRef<const char *> ValidationCounters,
699                 const pid_t ProcessID) const override {
700     // If LbrSamplingPeriod was provided, then ignore the
701     // CounterName because we only have one for LBR.
702     if (LbrSamplingPeriod > 0) {
703       // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or
704       // __linux__ (for now).
705 #if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&                \
706     defined(__linux__)
707       // TODO(boomanaiden154): Add in support for using validation counters when
708       // using LBR counters.
709       if (ValidationCounters.size() > 0)
710         return make_error<StringError>(
711             "Using LBR is not currently supported with validation counters",
712             errc::invalid_argument);
713 
714       return std::make_unique<X86LbrCounter>(
715           X86LbrPerfEvent(LbrSamplingPeriod));
716 #else
717       return make_error<StringError>(
718           "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
719           "or running on Linux.",
720           errc::invalid_argument);
721 #endif
722     }
723     return ExegesisTarget::createCounter(CounterName, State, ValidationCounters,
724                                          ProcessID);
725   }
726 
727   enum ArgumentRegisters { CodeSize = X86::R12, AuxiliaryMemoryFD = X86::R13 };
728 
729 private:
730   void addTargetSpecificPasses(PassManagerBase &PM) const override;
731 
732   MCRegister getScratchMemoryRegister(const Triple &TT) const override;
733 
734   MCRegister getDefaultLoopCounterRegister(const Triple &) const override;
735 
736   unsigned getMaxMemoryAccessSize() const override { return 64; }
737 
738   Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
739                                  MCOperand &AssignedValue,
740                                  const BitVector &ForbiddenRegs) const override;
741 
742   void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg,
743                           unsigned Offset) const override;
744 
745   void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
746                                    MachineBasicBlock &TargetMBB,
747                                    const MCInstrInfo &MII,
748                                    MCRegister LoopRegister) const override;
749 
750   std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, MCRegister Reg,
751                                const APInt &Value) const override;
752 
753 #ifdef __linux__
754   void generateLowerMunmap(std::vector<MCInst> &GeneratedCode) const override;
755 
756   void generateUpperMunmap(std::vector<MCInst> &GeneratedCode) const override;
757 
758   std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override;
759 
760   std::vector<MCInst>
761   generateMmap(uintptr_t Address, size_t Length,
762                uintptr_t FileDescriptorAddress) const override;
763 
764   void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override;
765 
766   void moveArgumentRegisters(std::vector<MCInst> &GeneratedCode) const override;
767 
768   std::vector<MCInst> generateMemoryInitialSetup() const override;
769 
770   std::vector<MCInst> setStackRegisterToAuxMem() const override;
771 
772   uintptr_t getAuxiliaryMemoryStartAddress() const override;
773 
774   std::vector<MCInst> configurePerfCounter(long Request, bool SaveRegisters) const override;
775 
776   std::vector<MCRegister> getArgumentRegisters() const override;
777 
778   std::vector<MCRegister> getRegistersNeedSaving() const override;
779 #endif // __linux__
780 
781   ArrayRef<MCPhysReg> getUnavailableRegisters() const override {
782     if (DisableUpperSSERegisters)
783       return ArrayRef(kUnavailableRegistersSSE);
784 
785     return ArrayRef(kUnavailableRegisters);
786   }
787 
788   bool allowAsBackToBack(const Instruction &Instr) const override {
789     const unsigned Opcode = Instr.Description.Opcode;
790     return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
791            Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
792   }
793 
794   std::vector<InstructionTemplate>
795   generateInstructionVariants(const Instruction &Instr,
796                               unsigned MaxConfigsPerOpcode) const override;
797 
798   std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
799       const LLVMState &State,
800       const SnippetGenerator::Options &Opts) const override {
801     return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
802   }
803 
804   std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
805       const LLVMState &State,
806       const SnippetGenerator::Options &Opts) const override {
807     return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
808   }
809 
810   bool matchesArch(Triple::ArchType Arch) const override {
811     return Arch == Triple::x86_64 || Arch == Triple::x86;
812   }
813 
814   Error checkFeatureSupport() const override {
815     // LBR is the only feature we conditionally support now.
816     // So if LBR is not requested, then we should be able to run the benchmarks.
817     if (LbrSamplingPeriod == 0)
818       return Error::success();
819 
820 #if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
821     defined(LIBPFM_HAS_FIELD_CYCLES)
822       // FIXME: Fix this.
823       // https://bugs.llvm.org/show_bug.cgi?id=48918
824       // For now, only do the check if we see an Intel machine because
825       // the counter uses some Intel-specific magic and it could get
826       // confused and think an AMD machine actually has LBR support.
827 #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
828     defined(_M_X64)
829     using namespace sys::detail::x86;
830 
831     if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
832       // If the kernel supports it, the hardware still may not have it.
833       return X86LbrCounter::checkLbrSupport();
834 #else
835     report_fatal_error("Running X86 exegesis on unsupported target");
836 #endif
837 #endif
838     return make_error<StringError>(
839         "LBR not supported on this kernel and/or platform",
840         errc::not_supported);
841   }
842 
843   std::unique_ptr<SavedState> withSavedState() const override {
844     return std::make_unique<X86SavedState>();
845   }
846 
847   static const MCPhysReg kUnavailableRegisters[4];
848   static const MCPhysReg kUnavailableRegistersSSE[12];
849 };
850 
851 // We disable a few registers that cannot be encoded on instructions with a REX
852 // prefix.
853 const MCPhysReg ExegesisX86Target::kUnavailableRegisters[4] = {
854     X86::AH, X86::BH, X86::CH, X86::DH};
855 
856 // Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
857 // decoder load.
858 const MCPhysReg ExegesisX86Target::kUnavailableRegistersSSE[12] = {
859     X86::AH,    X86::BH,    X86::CH,    X86::DH,    X86::XMM8,  X86::XMM9,
860     X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};
861 
862 // We're using one of R8-R15 because these registers are never hardcoded in
863 // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
864 // conflicts.
865 constexpr const MCPhysReg kDefaultLoopCounterReg = X86::R8;
866 
867 } // namespace
868 
869 void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
870   // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
871   PM.add(createX86FloatingPointStackifierPass());
872 }
873 
874 MCRegister ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
875   if (!TT.isArch64Bit()) {
876     // FIXME: This would require popping from the stack, so we would have to
877     // add some additional setup code.
878     return MCRegister();
879   }
880   return TT.isOSWindows() ? X86::RCX : X86::RDI;
881 }
882 
883 MCRegister
884 ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const {
885   if (!TT.isArch64Bit()) {
886     return MCRegister();
887   }
888   return kDefaultLoopCounterReg;
889 }
890 
891 Error ExegesisX86Target::randomizeTargetMCOperand(
892     const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
893     const BitVector &ForbiddenRegs) const {
894   const Operand &Op = Instr.getPrimaryOperand(Var);
895   switch (Op.getExplicitOperandInfo().OperandType) {
896   case X86::OperandType::OPERAND_COND_CODE:
897     AssignedValue =
898         MCOperand::createImm(randomIndex(X86::CondCode::LAST_VALID_COND));
899     return Error::success();
900   case X86::OperandType::OPERAND_ROUNDING_CONTROL:
901     AssignedValue =
902         MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
903     return Error::success();
904   default:
905     break;
906   }
907   return make_error<Failure>(
908       Twine("unimplemented operand type ")
909           .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
910 }
911 
912 void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
913                                            MCRegister Reg,
914                                            unsigned Offset) const {
915   assert(!isInvalidMemoryInstr(IT.getInstr()) &&
916          "fillMemoryOperands requires a valid memory instruction");
917   int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
918   assert(MemOpIdx >= 0 && "invalid memory operand index");
919   // getMemoryOperandNo() ignores tied operands, so we have to add them back.
920   MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
921   setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
922   setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
923   setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
924   setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
925   setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
926 }
927 
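// Emits `add $-1, LoopRegister` followed by a conditional jump back to
// TargetMBB, so the loop repeats until the counter reaches zero.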
928 void ExegesisX86Target::decrementLoopCounterAndJump(
929     MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
930     const MCInstrInfo &MII, MCRegister LoopRegister) const {
931   BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
932       .addDef(LoopRegister)
933       .addUse(LoopRegister)
934       .addImm(-1);
935   BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
936       .addMBB(&TargetMBB)
937       .addImm(X86::COND_NE);
938 }
939 
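// Helpers emitting a 64-bit push/pop of `Register`; used to preserve and
// restore registers around generated system calls.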
940 void generateRegisterStackPush(unsigned int Register,
941                                std::vector<MCInst> &GeneratedCode) {
942   GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register));
943 }
944 
945 void generateRegisterStackPop(unsigned int Register,
946                               std::vector<MCInst> &GeneratedCode) {
947   GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register));
948 }
949 
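// Loads the system call number into RAX and emits a SYSCALL instruction. On
// Linux x86-64 the kernel clobbers RCX and R11 and returns the result in RAX.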
950 void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
951   GeneratedCode.push_back(
952       loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber)));
953   GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL));
954 }
955 
956 // The functions below for saving and restoring system call registers are only
957 // used when llvm-exegesis is built on Linux.
958 #ifdef __linux__
959 constexpr std::array<unsigned, 6> SyscallArgumentRegisters{
960     X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9};
961 
962 static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode,
963                                  unsigned ArgumentCount) {
964   assert(ArgumentCount <= 6 &&
965          "System calls on X86-64 Linux can only take six arguments");
966   // Preserve RCX and R11 (Clobbered by the system call).
967   generateRegisterStackPush(X86::RCX, GeneratedCode);
968   generateRegisterStackPush(X86::R11, GeneratedCode);
969   // Preserve RAX (used for the syscall number/return value).
970   generateRegisterStackPush(X86::RAX, GeneratedCode);
971   // Preserve the registers used to pass arguments to the system call.
972   for (unsigned I = 0; I < ArgumentCount; ++I)
973     generateRegisterStackPush(SyscallArgumentRegisters[I], GeneratedCode);
974 }
975 
976 static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode,
977                                     unsigned ArgumentCount) {
978   assert(ArgumentCount <= 6 &&
979          "System calls on X86-64 Linux can only take six arguments");
980   // Restore the argument registers, in the opposite order of the way they are
981   // saved.
982   for (unsigned I = ArgumentCount; I > 0; --I) {
983     generateRegisterStackPop(SyscallArgumentRegisters[I - 1], GeneratedCode);
984   }
985   generateRegisterStackPop(X86::RAX, GeneratedCode);
986   generateRegisterStackPop(X86::R11, GeneratedCode);
987   generateRegisterStackPop(X86::RCX, GeneratedCode);
988 }
989 #endif // __linux__
990 
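// Sets the FS or GS segment base to `Value` via the arch_prctl system call.
// Only implemented for x86-64 Linux builds.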
991 static std::vector<MCInst> loadImmediateSegmentRegister(MCRegister Reg,
992                                                         const APInt &Value) {
993 #if defined(__x86_64__) && defined(__linux__)
994   assert(Value.getBitWidth() <= 64 && "Value must fit in the register.");
995   std::vector<MCInst> loadSegmentRegisterCode;
996   // Preserve the syscall registers here, as we don't want to make any
997   // assumptions about the order in which registers are loaded, and we
998   // might have already loaded registers that we are going to be
999   // clobbering here.
1000   saveSyscallRegisters(loadSegmentRegisterCode, 2);
1001   // Generate the instructions to make the arch_prctl system call to set
1002   // the registers.
1003   int SyscallCode = 0;
1004   if (Reg == X86::FS)
1005     SyscallCode = ARCH_SET_FS;
1006   else if (Reg == X86::GS)
1007     SyscallCode = ARCH_SET_GS;
1008   else
1009     llvm_unreachable("Only the segment registers GS and FS are supported");
1010   loadSegmentRegisterCode.push_back(
1011       loadImmediate(X86::RDI, 64, APInt(64, SyscallCode)));
1012   loadSegmentRegisterCode.push_back(loadImmediate(X86::RSI, 64, Value));
1013   generateSyscall(SYS_arch_prctl, loadSegmentRegisterCode);
1014   // Restore the registers in reverse order
1015   restoreSyscallRegisters(loadSegmentRegisterCode, 2);
1016   return loadSegmentRegisterCode;
1017 #else
1018   llvm_unreachable("Loading immediate segment registers is only supported with "
1019                    "x86-64 llvm-exegesis");
1020 #endif // defined(__x86_64__) && defined(__linux__)
1021 }
1022 
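// Builds the instruction sequence that sets `Reg` to `Value`, dispatching on
// the register class: immediate moves for GPRs, arch_prctl for segment
// registers, stack-based loads for mask/vector/x87 registers, and dedicated
// sequences for EFLAGS, MXCSR, FPCW and DF.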
1023 std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
1024                                                 MCRegister Reg,
1025                                                 const APInt &Value) const {
1026   if (X86::SEGMENT_REGRegClass.contains(Reg))
1027     return loadImmediateSegmentRegister(Reg, Value);
1028   if (X86::GR8RegClass.contains(Reg))
1029     return {loadImmediate(Reg, 8, Value)};
1030   if (X86::GR16RegClass.contains(Reg))
1031     return {loadImmediate(Reg, 16, Value)};
1032   if (X86::GR32RegClass.contains(Reg))
1033     return {loadImmediate(Reg, 32, Value)};
1034   if (X86::GR64RegClass.contains(Reg))
1035     return {loadImmediate(Reg, 64, Value)};
1036   if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
1037       X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
1038     switch (Value.getBitWidth()) {
1039     case 8:
1040       if (STI.getFeatureBits()[X86::FeatureDQI]) {
1041         ConstantInliner CI(Value);
1042         return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVBkm);
1043       }
1044       [[fallthrough]];
1045     case 16:
1046       if (STI.getFeatureBits()[X86::FeatureAVX512]) {
1047         ConstantInliner CI(Value.zextOrTrunc(16));
1048         return CI.loadAndFinalize(Reg, 16, X86::KMOVWkm);
1049       }
1050       break;
1051     case 32:
1052       if (STI.getFeatureBits()[X86::FeatureBWI]) {
1053         ConstantInliner CI(Value);
1054         return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVDkm);
1055       }
1056       break;
1057     case 64:
1058       if (STI.getFeatureBits()[X86::FeatureBWI]) {
1059         ConstantInliner CI(Value);
1060         return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVQkm);
1061       }
1062       break;
1063     }
1064   }
1065   ConstantInliner CI(Value);
1066   if (X86::VR64RegClass.contains(Reg))
1067     return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
1068   if (X86::VR128RegClass.contains(Reg)) {
1069     if (STI.getFeatureBits()[X86::FeatureAVX])
1070       return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
1071     return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
1072   }
1073   if (X86::VR128XRegClass.contains(Reg)) {
1074     if (STI.getFeatureBits()[X86::FeatureAVX512])
1075       return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
1076   }
1077   if (X86::VR256RegClass.contains(Reg)) {
1078     if (STI.getFeatureBits()[X86::FeatureAVX])
1079       return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
1080   }
1081   if (X86::VR256XRegClass.contains(Reg)) {
1082     if (STI.getFeatureBits()[X86::FeatureAVX512])
1083       return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
1084   }
1085   if (X86::VR512RegClass.contains(Reg))
1086     if (STI.getFeatureBits()[X86::FeatureAVX512])
1087       return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
1088   if (X86::RSTRegClass.contains(Reg)) {
1089     return CI.loadX87STAndFinalize(Reg);
1090   }
1091   if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
1092       X86::RFP80RegClass.contains(Reg)) {
1093     return CI.loadX87FPAndFinalize(Reg);
1094   }
1095   if (Reg == X86::EFLAGS)
1096     return CI.popFlagAndFinalize();
1097   if (Reg == X86::MXCSR)
1098     return CI.loadImplicitRegAndFinalize(
1099         STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
1100         0x1f80);
1101   if (Reg == X86::FPCW)
1102     return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
1103   if (Reg == X86::DF)
1104     return CI.loadDirectionFlagAndFinalize();
1105   return {}; // Not yet implemented.
1106 }
1107 
1108 #ifdef __linux__
1109 
1110 #ifdef __arm__
1111 static constexpr const uintptr_t VAddressSpaceCeiling = 0xC0000000;
1112 #else
1113 static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
1114 #endif
1115 
1116 void generateRoundToNearestPage(unsigned int Register,
1117                                 std::vector<MCInst> &GeneratedCode) {
1118   int PageSizeShift = static_cast<int>(round(log2(getpagesize())));
1119   // Round down to the nearest page by getting rid of the least significant bits
1120   // representing location in the page. Shift right to get rid of this info and
1121   // then shift back left.
1122   GeneratedCode.push_back(MCInstBuilder(X86::SHR64ri)
1123                               .addReg(Register)
1124                               .addReg(Register)
1125                               .addImm(PageSizeShift));
1126   GeneratedCode.push_back(MCInstBuilder(X86::SHL64ri)
1127                               .addReg(Register)
1128                               .addReg(Register)
1129                               .addImm(PageSizeShift));
1130 }
1131 
1132 void generateGetInstructionPointer(unsigned int ResultRegister,
1133                                    std::vector<MCInst> &GeneratedCode) {
1134   // Use a load effective address to get the current instruction pointer and put
1135   // it into the result register.
1136   GeneratedCode.push_back(MCInstBuilder(X86::LEA64r)
1137                               .addReg(ResultRegister)
1138                               .addReg(X86::RIP)
1139                               .addImm(1)
1140                               .addReg(0)
1141                               .addImm(0)
1142                               .addReg(0));
1143 }
1144 
1145 void ExegesisX86Target::generateLowerMunmap(
1146     std::vector<MCInst> &GeneratedCode) const {
1147   // Unmap starting at address zero
1148   GeneratedCode.push_back(loadImmediate(X86::RDI, 64, APInt(64, 0)));
1149   // Get the current instruction pointer so we know where to unmap up to.
1150   generateGetInstructionPointer(X86::RSI, GeneratedCode);
1151   generateRoundToNearestPage(X86::RSI, GeneratedCode);
1152   // Subtract a page from the end of the unmap so we don't unmap the currently
1153   // executing section.
1154   GeneratedCode.push_back(MCInstBuilder(X86::SUB64ri32)
1155                               .addReg(X86::RSI)
1156                               .addReg(X86::RSI)
1157                               .addImm(getpagesize()));
1158   generateSyscall(SYS_munmap, GeneratedCode);
1159 }
1160 
1161 void ExegesisX86Target::generateUpperMunmap(
1162     std::vector<MCInst> &GeneratedCode) const {
1163   generateGetInstructionPointer(X86::R8, GeneratedCode);
1164   // Load the size of the snippet into RDI from the argument register.
1165   GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
1166                               .addReg(X86::RDI)
1167                               .addReg(ArgumentRegisters::CodeSize));
1168   // Add the length of the snippet (in %RDI) to the current instruction pointer
1169   // (%R8) to get the address where we should start unmapping.
1170   GeneratedCode.push_back(MCInstBuilder(X86::ADD64rr)
1171                               .addReg(X86::RDI)
1172                               .addReg(X86::RDI)
1173                               .addReg(X86::R8));
1174   generateRoundToNearestPage(X86::RDI, GeneratedCode);
1175   // Add one page to the start address to ensure that we're above the snippet,
1176   // since the above function rounds down.
1177   GeneratedCode.push_back(MCInstBuilder(X86::ADD64ri32)
1178                               .addReg(X86::RDI)
1179                               .addReg(X86::RDI)
1180                               .addImm(getpagesize()));
1181   // Unmap to just one page under the ceiling of the address space.
1182   GeneratedCode.push_back(loadImmediate(
1183       X86::RSI, 64, APInt(64, VAddressSpaceCeiling - getpagesize())));
1184   GeneratedCode.push_back(MCInstBuilder(X86::SUB64rr)
1185                               .addReg(X86::RSI)
1186                               .addReg(X86::RSI)
1187                               .addReg(X86::RDI));
1188   generateSyscall(SYS_munmap, GeneratedCode);
1189 }
1190 
1191 std::vector<MCInst>
1192 ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const {
1193   std::vector<MCInst> ExitCallCode;
1194   ExitCallCode.push_back(loadImmediate(X86::RDI, 64, APInt(64, ExitCode)));
1195   generateSyscall(SYS_exit, ExitCallCode);
1196   return ExitCallCode;
1197 }
1198 
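// Emits an mmap(2) call mapping `Length` bytes at `Address` (read/write,
// shared, MAP_FIXED_NOREPLACE), using the file descriptor read from
// `FileDescriptorAddress`.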
1199 std::vector<MCInst>
1200 ExegesisX86Target::generateMmap(uintptr_t Address, size_t Length,
1201                                 uintptr_t FileDescriptorAddress) const {
1202   std::vector<MCInst> MmapCode;
1203   MmapCode.push_back(loadImmediate(X86::RDI, 64, APInt(64, Address)));
1204   MmapCode.push_back(loadImmediate(X86::RSI, 64, APInt(64, Length)));
1205   MmapCode.push_back(
1206       loadImmediate(X86::RDX, 64, APInt(64, PROT_READ | PROT_WRITE)));
1207   MmapCode.push_back(
1208       loadImmediate(X86::R10, 64, APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE)));
1209   // Copy file descriptor location from aux memory into R8
1210   MmapCode.push_back(
1211       loadImmediate(X86::R8, 64, APInt(64, FileDescriptorAddress)));
1212   // Dereference file descriptor into FD argument register
1213   MmapCode.push_back(MCInstBuilder(X86::MOV32rm)
1214                          .addReg(X86::R8D)
1215                          .addReg(X86::R8)
1216                          .addImm(1)
1217                          .addReg(0)
1218                          .addImm(0)
1219                          .addReg(0));
1220   MmapCode.push_back(loadImmediate(X86::R9, 64, APInt(64, 0)));
1221   generateSyscall(SYS_mmap, MmapCode);
1222   return MmapCode;
1223 }
1224 
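// Maps the auxiliary memory region at its fixed address using the file
// descriptor stashed in ArgumentRegisters::AuxiliaryMemoryFD.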
1225 void ExegesisX86Target::generateMmapAuxMem(
1226     std::vector<MCInst> &GeneratedCode) const {
1227   GeneratedCode.push_back(
1228       loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress())));
1229   GeneratedCode.push_back(loadImmediate(
1230       X86::RSI, 64, APInt(64, SubprocessMemory::AuxiliaryMemorySize)));
1231   GeneratedCode.push_back(
1232       loadImmediate(X86::RDX, 64, APInt(64, PROT_READ | PROT_WRITE)));
1233   GeneratedCode.push_back(
1234       loadImmediate(X86::R10, 64, APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE)));
1235   GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
1236                               .addReg(X86::R8)
1237                               .addReg(ArgumentRegisters::AuxiliaryMemoryFD));
1238   GeneratedCode.push_back(loadImmediate(X86::R9, 64, APInt(64, 0)));
1239   generateSyscall(SYS_mmap, GeneratedCode);
1240 }
1241 
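// Copies the subprocess arguments (code size in RDI, auxiliary memory FD in
// RSI) into the reserved ArgumentRegisters before setup code clobbers RDI and
// RSI.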
1242 void ExegesisX86Target::moveArgumentRegisters(
1243     std::vector<MCInst> &GeneratedCode) const {
1244   GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
1245                               .addReg(ArgumentRegisters::CodeSize)
1246                               .addReg(X86::RDI));
1247   GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
1248                               .addReg(ArgumentRegisters::AuxiliaryMemoryFD)
1249                               .addReg(X86::RSI));
1250 }
1251 
1252 std::vector<MCInst> ExegesisX86Target::generateMemoryInitialSetup() const {
1253   std::vector<MCInst> MemoryInitialSetupCode;
1254   moveArgumentRegisters(MemoryInitialSetupCode);
1255   generateLowerMunmap(MemoryInitialSetupCode);
1256   generateUpperMunmap(MemoryInitialSetupCode);
1257   generateMmapAuxMem(MemoryInitialSetupCode);
1258   return MemoryInitialSetupCode;
1259 }
1260 
1261 std::vector<MCInst> ExegesisX86Target::setStackRegisterToAuxMem() const {
1262   // Moves %rsp to the end of the auxiliary memory
1263   return {MCInstBuilder(X86::MOV64ri)
1264               .addReg(X86::RSP)
1265               .addImm(getAuxiliaryMemoryStartAddress() +
1266                       SubprocessMemory::AuxiliaryMemorySize)};
1267 }
1268 
1269 uintptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const {
1270   // Return the second to last page in the virtual address space to try and
1271   // prevent interference with memory annotations in the snippet
1272   return VAddressSpaceCeiling - 2 * getpagesize();
1273 }
1274 
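// Issues an ioctl(2) on the perf-event file descriptor stored at the start of
// the auxiliary memory region, passing `Request` (plus PERF_IOC_FLAG_GROUP
// when built with libpfm), optionally preserving the syscall-clobbered
// registers.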
1275 std::vector<MCInst>
1276 ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const {
1277   std::vector<MCInst> ConfigurePerfCounterCode;
1278   if (SaveRegisters)
1279     saveSyscallRegisters(ConfigurePerfCounterCode, 3);
1280   ConfigurePerfCounterCode.push_back(
1281       loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress())));
1282   ConfigurePerfCounterCode.push_back(MCInstBuilder(X86::MOV32rm)
1283                                          .addReg(X86::EDI)
1284                                          .addReg(X86::RDI)
1285                                          .addImm(1)
1286                                          .addReg(0)
1287                                          .addImm(0)
1288                                          .addReg(0));
1289   ConfigurePerfCounterCode.push_back(
1290       loadImmediate(X86::RSI, 64, APInt(64, Request)));
1291 #ifdef HAVE_LIBPFM
1292   ConfigurePerfCounterCode.push_back(
1293       loadImmediate(X86::RDX, 64, APInt(64, PERF_IOC_FLAG_GROUP)));
1294 #endif // HAVE_LIBPFM
1295   generateSyscall(SYS_ioctl, ConfigurePerfCounterCode);
1296   if (SaveRegisters)
1297     restoreSyscallRegisters(ConfigurePerfCounterCode, 3);
1298   return ConfigurePerfCounterCode;
1299 }
1300 
1301 std::vector<MCRegister> ExegesisX86Target::getArgumentRegisters() const {
1302   return {X86::RDI, X86::RSI};
1303 }
1304 
1305 std::vector<MCRegister> ExegesisX86Target::getRegistersNeedSaving() const {
1306   return {X86::RAX, X86::RDI, X86::RSI, X86::RCX, X86::R11};
1307 }
1308 
1309 #endif // __linux__
1310 
1311 // Instructions can have some variable operands, and we may want to see how
1312 // different operands affect performance. So for each operand position,
1313 // precompute all the possible choices we might care about,
1314 // and greedily generate all the possible combinations of choices.
1315 std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
1316     const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
1317   bool Exploration = false;
1318   SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
1319   VariableChoices.resize(Instr.Variables.size());
1320   for (auto I : zip(Instr.Variables, VariableChoices)) {
1321     const Variable &Var = std::get<0>(I);
1322     SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);
1323 
1324     switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
1325     default:
1326       // We don't wish to explicitly explore this variable.
1327       Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
1328       continue;
1329     case X86::OperandType::OPERAND_COND_CODE: {
1330       Exploration = true;
1331       auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
1332                                           X86::CondCode::LAST_VALID_COND,
1333                                           force_iteration_on_noniterable_enum);
1334       Choices.reserve(CondCodes.size());
1335       for (int CondCode : CondCodes)
1336         Choices.emplace_back(MCOperand::createImm(CondCode));
1337       break;
1338     }
1339     }
1340   }
1341 
1342   // If we don't wish to explore any variables, defer to the baseline method.
1343   if (!Exploration)
1344     return ExegesisTarget::generateInstructionVariants(Instr,
1345                                                        MaxConfigsPerOpcode);
1346 
1347   std::vector<InstructionTemplate> Variants;
1348   size_t NumVariants;
1349   CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
1350       VariableChoices);
1351 
1352   // How many operand combinations can we produce, within the limit?
1353   NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
1354   // And actually produce all the wanted operand combinations.
1355   Variants.reserve(NumVariants);
1356   G.generate([&](ArrayRef<MCOperand> State) -> bool {
1357     Variants.emplace_back(&Instr);
1358     Variants.back().setVariableValues(State);
1359     // Did we run out of space for variants?
1360     return Variants.size() >= NumVariants;
1361   });
1362 
1363   assert(Variants.size() == NumVariants &&
1364          Variants.size() <= MaxConfigsPerOpcode &&
1365          "Should not produce too many variants");
1366   return Variants;
1367 }
1368 
1369 static ExegesisTarget *getTheExegesisX86Target() {
1370   static ExegesisX86Target Target;
1371   return &Target;
1372 }
1373 
1374 void InitializeX86ExegesisTarget() {
1375   ExegesisTarget::registerTarget(getTheExegesisX86Target());
1376 }
1377 
1378 } // namespace exegesis
1379 } // namespace llvm
1380