//===-- X86FixupInstTuning.cpp - replace instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file does a tuning pass replacing slower machine instructions
// with faster ones. We do this here, as opposed to during normal ISel, as
// attempting to get the "right" instruction can break patterns. This pass
// is not meant to search for special cases where an instruction can be
// transformed into another; it is only meant to do transformations where the
// old instruction is always replaceable with the new instruction. For example:
//
//      `vpermq ymm` -> `vshufd ymm`
//          -- BAD, not always valid (lane cross/non-repeated mask)
//
//      `vpermilps ymm` -> `vshufd ymm`
//          -- GOOD, always replaceable
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "x86-fixup-inst-tuning"

STATISTIC(NumInstChanges, "Number of instruction changes");

namespace {
class X86FixupInstTuningPass : public MachineFunctionPass {
public:
  static char ID;

  X86FixupInstTuningPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }

  bool runOnMachineFunction(MachineFunction &MF) override;
  bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &I);

  // This pass runs after regalloc and doesn't support VReg operands.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

private:
  const X86InstrInfo *TII = nullptr;
  const X86Subtarget *ST = nullptr;
  const MCSchedModel *SM = nullptr;
};
} // end anonymous namespace

char X86FixupInstTuningPass::ID = 0;

INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)

FunctionPass *llvm::createX86FixupInstTuning() {
  return new X86FixupInstTuningPass();
}

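// Compare two optional metric values (throughput, latency, or size). Returns
// true/false only when both values are present and differ (true iff NewVal is
// the smaller, i.e. better, value); returns std::nullopt when either value is
// missing or the two are equal, so the caller can fall through to the next
// metric.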
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}

bool X86FixupInstTuningPass::processInstruction(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator &I) {
  MachineInstr &MI = *I;
  unsigned Opc = MI.getOpcode();
  unsigned NumOperands = MI.getDesc().getNumOperands();

  auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
    // We already checked that SchedModel exists in `NewOpcPreferable`.
    return MCSchedModel::getReciprocalThroughput(
        *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
  };

  auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
    // We already checked that SchedModel exists in `NewOpcPreferable`.
    return MCSchedModel::computeInstrLatency(
        *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
  };

  auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
    if (unsigned Size = TII->get(Opcode).getSize())
      return Size;
    // Zero size means we were unable to compute it.
    return std::nullopt;
  };

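  // Returns true if NewOpc looks like a profitable replacement for the current
  // opcode: instructions are compared by reciprocal throughput, then latency
  // (both only when an instruction scheduling model is available), then code
  // size. `ReplaceInTie` decides the result when no metric distinguishes them.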
  auto NewOpcPreferable = [&](unsigned NewOpc,
                              bool ReplaceInTie = true) -> bool {
    std::optional<bool> Res;
    if (SM->hasInstrSchedModel()) {
      // Compare tput -> lat -> code size.
      Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
      if (Res.has_value())
        return *Res;

      Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
      if (Res.has_value())
        return *Res;
    }

    Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
    if (Res.has_value())
      return *Res;

    // We were either unable to get tput/lat/codesize or all values were
    // equal. Return the specified option for a tie.
    return ReplaceInTie;
  };

  // `vpermilps r, i` -> `vshufps r, r, i`
  // `vpermilps r, i, k` -> `vshufps r, r, i, k`
  // `vshufps` is always as fast or faster than `vpermilps` and takes
  // 1 less byte of code size for the SSE and VEX encodings.
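  // The rewrite drops the trailing immediate, duplicates the last register
  // operand as the second shuffle source, switches the descriptor to the
  // `vshufps` opcode, and then re-appends the immediate.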
  auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
    if (!NewOpcPreferable(NewOpc))
      return false;
    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
    MI.removeOperand(NumOperands - 1);
    MI.addOperand(MI.getOperand(NumOperands - 2));
    MI.setDesc(TII->get(NewOpc));
    MI.addOperand(MachineOperand::CreateImm(MaskImm));
    return true;
  };

  // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles.
  // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less
  // byte of code size.
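  // Note: `vpermilps` is a floating-point-domain shuffle while `vpshufd` is an
  // integer-domain shuffle, so on targets with a shuffle domain-crossing
  // (bypass) penalty the swap can cost latency; hence the
  // hasNoDomainDelayShuffle() gate below.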
  auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
    // TODO: Might be worth doing this even with a bypass delay if -Os/-Oz is
    // enabled, as `vpshufd` saves a byte of code size.
    if (!ST->hasNoDomainDelayShuffle() &&
        !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;
    MI.setDesc(TII->get(NewOpc));
    return true;
  };

  // `vunpcklpd/vmovlhps r, r` -> `vshufpd r, r, 0x00`
  // `vunpckhpd r, r` -> `vshufpd r, r, 0xff`
  // `vunpcklpd r, r, k` -> `vshufpd r, r, 0x00`
  // `vunpckhpd r, r, k` -> `vshufpd r, r, 0xff`
  // iff `vshufpd` is faster than `vunpck{l|h}pd`. Otherwise stick with
  // `vunpck{l|h}pd` as it uses less code size.
  // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PD`
  // as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no codesize cost.
  auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
    if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;

    MI.setDesc(TII->get(NewOpc));
    MI.addOperand(MachineOperand::CreateImm(MaskImm));
    return true;
  };

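  // Each `(v)shufpd` immediate bit selects the low (0) or high (1) qword of
  // the corresponding source within its 128-bit lane, so an all-zero mask
  // reproduces `unpcklpd` and an all-ones mask reproduces `unpckhpd`.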
  auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool {
    return ProcessUNPCKPD(NewOpc, 0x00);
  };
  auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool {
    return ProcessUNPCKPD(NewOpc, 0xff);
  };

  switch (Opc) {
  case X86::VPERMILPSri:
    return ProcessVPERMILPSri(X86::VSHUFPSrri);
  case X86::VPERMILPSYri:
    return ProcessVPERMILPSri(X86::VSHUFPSYrri);
  case X86::VPERMILPSZ128ri:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
  case X86::VPERMILPSZ256ri:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
  case X86::VPERMILPSZri:
    return ProcessVPERMILPSri(X86::VSHUFPSZrri);
  case X86::VPERMILPSZ128rikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
  case X86::VPERMILPSZ256rikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
  case X86::VPERMILPSZrikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
  case X86::VPERMILPSZ128rik:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
  case X86::VPERMILPSZ256rik:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
  case X86::VPERMILPSZrik:
    return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
  case X86::VPERMILPSmi:
    return ProcessVPERMILPSmi(X86::VPSHUFDmi);
  case X86::VPERMILPSYmi:
    // TODO: See if there is a more generic way we can test if the replacement
    // instruction is supported.
    return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
  case X86::VPERMILPSZ128mi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
  case X86::VPERMILPSZ256mi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
  case X86::VPERMILPSZmi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
  case X86::VPERMILPSZ128mikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
  case X86::VPERMILPSZ256mikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
  case X86::VPERMILPSZmikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
  case X86::VPERMILPSZ128mik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
  case X86::VPERMILPSZ256mik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
  case X86::VPERMILPSZmik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmik);

    // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
    // `{VP}UNPCK{L|H}QDQ{...}`, which gets the same perf benefit as
    // `{V}SHUF{...}PD` but 1) without increasing code size and 2) can also
    // handle the `mr` case. ICL doesn't have a domain penalty for replacing
    // float unpck -> int unpck, but at this time, I haven't verified the set
    // of processors where it's safe.
  case X86::MOVLHPSrr:
  case X86::UNPCKLPDrr:
    return ProcessUNPCKLPDrr(X86::SHUFPDrri);
  case X86::VMOVLHPSrr:
  case X86::VUNPCKLPDrr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDrri);
  case X86::VUNPCKLPDYrr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDYrri);
    // VMOVLHPS is always 128 bits.
  case X86::VMOVLHPSZrr:
  case X86::VUNPCKLPDZ128rr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rri);
  case X86::VUNPCKLPDZ256rr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rri);
  case X86::VUNPCKLPDZrr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZrri);
  case X86::VUNPCKLPDZ128rrk:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrik);
  case X86::VUNPCKLPDZ256rrk:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrik);
  case X86::VUNPCKLPDZrrk:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZrrik);
  case X86::VUNPCKLPDZ128rrkz:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrikz);
  case X86::VUNPCKLPDZ256rrkz:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrikz);
  case X86::VUNPCKLPDZrrkz:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZrrikz);
  case X86::UNPCKHPDrr:
    return ProcessUNPCKHPDrr(X86::SHUFPDrri);
  case X86::VUNPCKHPDrr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDrri);
  case X86::VUNPCKHPDYrr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDYrri);
  case X86::VUNPCKHPDZ128rr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rri);
  case X86::VUNPCKHPDZ256rr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rri);
  case X86::VUNPCKHPDZrr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZrri);
  case X86::VUNPCKHPDZ128rrk:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrik);
  case X86::VUNPCKHPDZ256rrk:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrik);
  case X86::VUNPCKHPDZrrk:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZrrik);
  case X86::VUNPCKHPDZ128rrkz:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrikz);
  case X86::VUNPCKHPDZ256rrkz:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrikz);
  case X86::VUNPCKHPDZrrkz:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZrrikz);
  default:
    return false;
  }
}

bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
  bool Changed = false;
  ST = &MF.getSubtarget<X86Subtarget>();
  TII = ST->getInstrInfo();
  SM = &ST->getSchedModel();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
      if (processInstruction(MF, MBB, I)) {
        ++NumInstChanges;
        Changed = true;
      }
    }
  }
  LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
  return Changed;
}