xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/X86/X86FixupInstTuning.cpp (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
//===-- X86FixupInstTuning.cpp - replace instructions ------------===//
2*06c3fb27SDimitry Andric //
3*06c3fb27SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*06c3fb27SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*06c3fb27SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*06c3fb27SDimitry Andric //
7*06c3fb27SDimitry Andric //===----------------------------------------------------------------------===//
8*06c3fb27SDimitry Andric //
// This file does a tuning pass replacing slower machine instructions
// with faster ones. We do this here, as opposed to during normal ISel, as
// attempting to get the "right" instruction can break patterns. This pass
// is not meant to search for special cases where an instruction can be
// transformed to another; it is only meant to do transformations where the old
// instruction is always replaceable with the new instruction. For example:
15*06c3fb27SDimitry Andric //
16*06c3fb27SDimitry Andric //      `vpermq ymm` -> `vshufd ymm`
17*06c3fb27SDimitry Andric //          -- BAD, not always valid (lane cross/non-repeated mask)
18*06c3fb27SDimitry Andric //
19*06c3fb27SDimitry Andric //      `vpermilps ymm` -> `vshufd ymm`
20*06c3fb27SDimitry Andric //          -- GOOD, always replaceable
21*06c3fb27SDimitry Andric //
22*06c3fb27SDimitry Andric //===----------------------------------------------------------------------===//
23*06c3fb27SDimitry Andric 
24*06c3fb27SDimitry Andric #include "X86.h"
25*06c3fb27SDimitry Andric #include "X86InstrInfo.h"
26*06c3fb27SDimitry Andric #include "X86Subtarget.h"
27*06c3fb27SDimitry Andric #include "llvm/ADT/Statistic.h"
28*06c3fb27SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
29*06c3fb27SDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h"
30*06c3fb27SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
31*06c3fb27SDimitry Andric 
32*06c3fb27SDimitry Andric using namespace llvm;
33*06c3fb27SDimitry Andric 
34*06c3fb27SDimitry Andric #define DEBUG_TYPE "x86-fixup-inst-tuning"
35*06c3fb27SDimitry Andric 
36*06c3fb27SDimitry Andric STATISTIC(NumInstChanges, "Number of instructions changes");
37*06c3fb27SDimitry Andric 
38*06c3fb27SDimitry Andric namespace {
39*06c3fb27SDimitry Andric class X86FixupInstTuningPass : public MachineFunctionPass {
40*06c3fb27SDimitry Andric public:
41*06c3fb27SDimitry Andric   static char ID;
42*06c3fb27SDimitry Andric 
X86FixupInstTuningPass()43*06c3fb27SDimitry Andric   X86FixupInstTuningPass() : MachineFunctionPass(ID) {}
44*06c3fb27SDimitry Andric 
getPassName() const45*06c3fb27SDimitry Andric   StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }
46*06c3fb27SDimitry Andric 
47*06c3fb27SDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
48*06c3fb27SDimitry Andric   bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
49*06c3fb27SDimitry Andric                           MachineBasicBlock::iterator &I);
50*06c3fb27SDimitry Andric 
51*06c3fb27SDimitry Andric   // This pass runs after regalloc and doesn't support VReg operands.
getRequiredProperties() const52*06c3fb27SDimitry Andric   MachineFunctionProperties getRequiredProperties() const override {
53*06c3fb27SDimitry Andric     return MachineFunctionProperties().set(
54*06c3fb27SDimitry Andric         MachineFunctionProperties::Property::NoVRegs);
55*06c3fb27SDimitry Andric   }
56*06c3fb27SDimitry Andric 
57*06c3fb27SDimitry Andric private:
58*06c3fb27SDimitry Andric   const X86InstrInfo *TII = nullptr;
59*06c3fb27SDimitry Andric   const X86Subtarget *ST = nullptr;
60*06c3fb27SDimitry Andric   const MCSchedModel *SM = nullptr;
61*06c3fb27SDimitry Andric };
62*06c3fb27SDimitry Andric } // end anonymous namespace
63*06c3fb27SDimitry Andric 
64*06c3fb27SDimitry Andric char X86FixupInstTuningPass::ID = 0;
65*06c3fb27SDimitry Andric 
INITIALIZE_PASS(X86FixupInstTuningPass,DEBUG_TYPE,DEBUG_TYPE,false,false)66*06c3fb27SDimitry Andric INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
67*06c3fb27SDimitry Andric 
68*06c3fb27SDimitry Andric FunctionPass *llvm::createX86FixupInstTuning() {
69*06c3fb27SDimitry Andric   return new X86FixupInstTuningPass();
70*06c3fb27SDimitry Andric }
71*06c3fb27SDimitry Andric 
/// Three-way style comparison of two optional values.
///
/// \returns `*NewVal < *CurVal` when both optionals hold a value and those
/// values differ; `std::nullopt` when either is empty or the values compare
/// equal (i.e. the comparison is inconclusive).
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // Nothing to compare if either side is missing.
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;

  // Only a strict difference yields a verdict.
  if (*NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}
79*06c3fb27SDimitry Andric 
processInstruction(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator & I)80*06c3fb27SDimitry Andric bool X86FixupInstTuningPass::processInstruction(
81*06c3fb27SDimitry Andric     MachineFunction &MF, MachineBasicBlock &MBB,
82*06c3fb27SDimitry Andric     MachineBasicBlock::iterator &I) {
83*06c3fb27SDimitry Andric   MachineInstr &MI = *I;
84*06c3fb27SDimitry Andric   unsigned Opc = MI.getOpcode();
85*06c3fb27SDimitry Andric   unsigned NumOperands = MI.getDesc().getNumOperands();
86*06c3fb27SDimitry Andric 
87*06c3fb27SDimitry Andric   auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
88*06c3fb27SDimitry Andric     // We already checked that SchedModel exists in `NewOpcPreferable`.
89*06c3fb27SDimitry Andric     return MCSchedModel::getReciprocalThroughput(
90*06c3fb27SDimitry Andric         *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
91*06c3fb27SDimitry Andric   };
92*06c3fb27SDimitry Andric 
93*06c3fb27SDimitry Andric   auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
94*06c3fb27SDimitry Andric     // We already checked that SchedModel exists in `NewOpcPreferable`.
95*06c3fb27SDimitry Andric     return MCSchedModel::computeInstrLatency(
96*06c3fb27SDimitry Andric         *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
97*06c3fb27SDimitry Andric   };
98*06c3fb27SDimitry Andric 
99*06c3fb27SDimitry Andric   auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
100*06c3fb27SDimitry Andric     if (unsigned Size = TII->get(Opcode).getSize())
101*06c3fb27SDimitry Andric       return Size;
102*06c3fb27SDimitry Andric     // Zero size means we where unable to compute it.
103*06c3fb27SDimitry Andric     return std::nullopt;
104*06c3fb27SDimitry Andric   };
105*06c3fb27SDimitry Andric 
106*06c3fb27SDimitry Andric   auto NewOpcPreferable = [&](unsigned NewOpc,
107*06c3fb27SDimitry Andric                               bool ReplaceInTie = true) -> bool {
108*06c3fb27SDimitry Andric     std::optional<bool> Res;
109*06c3fb27SDimitry Andric     if (SM->hasInstrSchedModel()) {
110*06c3fb27SDimitry Andric       // Compare tput -> lat -> code size.
111*06c3fb27SDimitry Andric       Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
112*06c3fb27SDimitry Andric       if (Res.has_value())
113*06c3fb27SDimitry Andric         return *Res;
114*06c3fb27SDimitry Andric 
115*06c3fb27SDimitry Andric       Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
116*06c3fb27SDimitry Andric       if (Res.has_value())
117*06c3fb27SDimitry Andric         return *Res;
118*06c3fb27SDimitry Andric     }
119*06c3fb27SDimitry Andric 
120*06c3fb27SDimitry Andric     Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
121*06c3fb27SDimitry Andric     if (Res.has_value())
122*06c3fb27SDimitry Andric       return *Res;
123*06c3fb27SDimitry Andric 
124*06c3fb27SDimitry Andric     // We either have either were unable to get tput/lat/codesize or all values
125*06c3fb27SDimitry Andric     // were equal. Return specified option for a tie.
126*06c3fb27SDimitry Andric     return ReplaceInTie;
127*06c3fb27SDimitry Andric   };
128*06c3fb27SDimitry Andric 
129*06c3fb27SDimitry Andric   // `vpermilpd r, i` -> `vshufpd r, r, i`
130*06c3fb27SDimitry Andric   // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
131*06c3fb27SDimitry Andric   // `vshufpd` is always as fast or faster than `vpermilpd` and takes
132*06c3fb27SDimitry Andric   // 1 less byte of code size for VEX and EVEX encoding.
133*06c3fb27SDimitry Andric   auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
134*06c3fb27SDimitry Andric     if (!NewOpcPreferable(NewOpc))
135*06c3fb27SDimitry Andric       return false;
136*06c3fb27SDimitry Andric     unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
137*06c3fb27SDimitry Andric     MI.removeOperand(NumOperands - 1);
138*06c3fb27SDimitry Andric     MI.addOperand(MI.getOperand(NumOperands - 2));
139*06c3fb27SDimitry Andric     MI.setDesc(TII->get(NewOpc));
140*06c3fb27SDimitry Andric     MI.addOperand(MachineOperand::CreateImm(MaskImm));
141*06c3fb27SDimitry Andric     return true;
142*06c3fb27SDimitry Andric   };
143*06c3fb27SDimitry Andric 
144*06c3fb27SDimitry Andric   // `vpermilps r, i` -> `vshufps r, r, i`
145*06c3fb27SDimitry Andric   // `vpermilps r, i, k` -> `vshufps r, r, i, k`
146*06c3fb27SDimitry Andric   // `vshufps` is always as fast or faster than `vpermilps` and takes
147*06c3fb27SDimitry Andric   // 1 less byte of code size for VEX and EVEX encoding.
148*06c3fb27SDimitry Andric   auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
149*06c3fb27SDimitry Andric     if (!NewOpcPreferable(NewOpc))
150*06c3fb27SDimitry Andric       return false;
151*06c3fb27SDimitry Andric     unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
152*06c3fb27SDimitry Andric     MI.removeOperand(NumOperands - 1);
153*06c3fb27SDimitry Andric     MI.addOperand(MI.getOperand(NumOperands - 2));
154*06c3fb27SDimitry Andric     MI.setDesc(TII->get(NewOpc));
155*06c3fb27SDimitry Andric     MI.addOperand(MachineOperand::CreateImm(MaskImm));
156*06c3fb27SDimitry Andric     return true;
157*06c3fb27SDimitry Andric   };
158*06c3fb27SDimitry Andric 
159*06c3fb27SDimitry Andric   // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles.
160*06c3fb27SDimitry Andric   // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less
161*06c3fb27SDimitry Andric   // byte of code size.
162*06c3fb27SDimitry Andric   auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
163*06c3fb27SDimitry Andric     // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as
164*06c3fb27SDimitry Andric     // `vpshufd` saves a byte of code size.
165*06c3fb27SDimitry Andric     if (!ST->hasNoDomainDelayShuffle() ||
166*06c3fb27SDimitry Andric         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
167*06c3fb27SDimitry Andric       return false;
168*06c3fb27SDimitry Andric     MI.setDesc(TII->get(NewOpc));
169*06c3fb27SDimitry Andric     return true;
170*06c3fb27SDimitry Andric   };
171*06c3fb27SDimitry Andric 
172*06c3fb27SDimitry Andric   // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00`
173*06c3fb27SDimitry Andric   // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff`
174*06c3fb27SDimitry Andric   // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00`
175*06c3fb27SDimitry Andric   // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff`
176*06c3fb27SDimitry Andric   // `vunpcklpd r, m` -> `vunpcklqdq r, m, k`
177*06c3fb27SDimitry Andric   // `vunpckhpd r, m` -> `vunpckhqdq r, m, k`
178*06c3fb27SDimitry Andric   // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k`
179*06c3fb27SDimitry Andric   // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k`
180*06c3fb27SDimitry Andric   // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd`
181*06c3fb27SDimitry Andric   //        -> `vunpck{l|h}qdq`
182*06c3fb27SDimitry Andric   // 2) If `vshufpd` faster than `vunpck{l|h}pd`
183*06c3fb27SDimitry Andric   //        -> `vshufpd`
184*06c3fb27SDimitry Andric   //
185*06c3fb27SDimitry Andric   // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay)
186*06c3fb27SDimitry Andric   auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
187*06c3fb27SDimitry Andric     if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
188*06c3fb27SDimitry Andric       return false;
189*06c3fb27SDimitry Andric 
190*06c3fb27SDimitry Andric     MI.setDesc(TII->get(NewOpc));
191*06c3fb27SDimitry Andric     MI.addOperand(MachineOperand::CreateImm(MaskImm));
192*06c3fb27SDimitry Andric     return true;
193*06c3fb27SDimitry Andric   };
194*06c3fb27SDimitry Andric 
195*06c3fb27SDimitry Andric   auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool {
196*06c3fb27SDimitry Andric     // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
197*06c3fb27SDimitry Andric     // downside to the integer unpck, but if someone doesn't specify exact
198*06c3fb27SDimitry Andric     // target we won't find it faster.
199*06c3fb27SDimitry Andric     if (!ST->hasNoDomainDelayShuffle() ||
200*06c3fb27SDimitry Andric         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
201*06c3fb27SDimitry Andric       return false;
202*06c3fb27SDimitry Andric     MI.setDesc(TII->get(NewOpc));
203*06c3fb27SDimitry Andric     return true;
204*06c3fb27SDimitry Andric   };
205*06c3fb27SDimitry Andric 
206*06c3fb27SDimitry Andric   auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
207*06c3fb27SDimitry Andric                                unsigned NewOpc) -> bool {
208*06c3fb27SDimitry Andric     if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
209*06c3fb27SDimitry Andric       return true;
210*06c3fb27SDimitry Andric     return ProcessUNPCK(NewOpc, 0x00);
211*06c3fb27SDimitry Andric   };
212*06c3fb27SDimitry Andric   auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
213*06c3fb27SDimitry Andric                                unsigned NewOpc) -> bool {
214*06c3fb27SDimitry Andric     if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
215*06c3fb27SDimitry Andric       return true;
216*06c3fb27SDimitry Andric     return ProcessUNPCK(NewOpc, 0xff);
217*06c3fb27SDimitry Andric   };
218*06c3fb27SDimitry Andric 
219*06c3fb27SDimitry Andric   auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
220*06c3fb27SDimitry Andric     return ProcessUNPCKToIntDomain(NewOpcIntDomain);
221*06c3fb27SDimitry Andric   };
222*06c3fb27SDimitry Andric 
223*06c3fb27SDimitry Andric   auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
224*06c3fb27SDimitry Andric     return ProcessUNPCKToIntDomain(NewOpc);
225*06c3fb27SDimitry Andric   };
226*06c3fb27SDimitry Andric 
227*06c3fb27SDimitry Andric   switch (Opc) {
228*06c3fb27SDimitry Andric   case X86::VPERMILPDri:
229*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDrri);
230*06c3fb27SDimitry Andric   case X86::VPERMILPDYri:
231*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDYrri);
232*06c3fb27SDimitry Andric   case X86::VPERMILPDZ128ri:
233*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
234*06c3fb27SDimitry Andric   case X86::VPERMILPDZ256ri:
235*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
236*06c3fb27SDimitry Andric   case X86::VPERMILPDZri:
237*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZrri);
238*06c3fb27SDimitry Andric   case X86::VPERMILPDZ128rikz:
239*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
240*06c3fb27SDimitry Andric   case X86::VPERMILPDZ256rikz:
241*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
242*06c3fb27SDimitry Andric   case X86::VPERMILPDZrikz:
243*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
244*06c3fb27SDimitry Andric   case X86::VPERMILPDZ128rik:
245*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
246*06c3fb27SDimitry Andric   case X86::VPERMILPDZ256rik:
247*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
248*06c3fb27SDimitry Andric   case X86::VPERMILPDZrik:
249*06c3fb27SDimitry Andric     return ProcessVPERMILPDri(X86::VSHUFPDZrrik);
250*06c3fb27SDimitry Andric 
251*06c3fb27SDimitry Andric   case X86::VPERMILPSri:
252*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSrri);
253*06c3fb27SDimitry Andric   case X86::VPERMILPSYri:
254*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSYrri);
255*06c3fb27SDimitry Andric   case X86::VPERMILPSZ128ri:
256*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
257*06c3fb27SDimitry Andric   case X86::VPERMILPSZ256ri:
258*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
259*06c3fb27SDimitry Andric   case X86::VPERMILPSZri:
260*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZrri);
261*06c3fb27SDimitry Andric   case X86::VPERMILPSZ128rikz:
262*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
263*06c3fb27SDimitry Andric   case X86::VPERMILPSZ256rikz:
264*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
265*06c3fb27SDimitry Andric   case X86::VPERMILPSZrikz:
266*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
267*06c3fb27SDimitry Andric   case X86::VPERMILPSZ128rik:
268*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
269*06c3fb27SDimitry Andric   case X86::VPERMILPSZ256rik:
270*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
271*06c3fb27SDimitry Andric   case X86::VPERMILPSZrik:
272*06c3fb27SDimitry Andric     return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
273*06c3fb27SDimitry Andric   case X86::VPERMILPSmi:
274*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDmi);
275*06c3fb27SDimitry Andric   case X86::VPERMILPSYmi:
276*06c3fb27SDimitry Andric     // TODO: See if there is a more generic way we can test if the replacement
277*06c3fb27SDimitry Andric     // instruction is supported.
278*06c3fb27SDimitry Andric     return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
279*06c3fb27SDimitry Andric   case X86::VPERMILPSZ128mi:
280*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
281*06c3fb27SDimitry Andric   case X86::VPERMILPSZ256mi:
282*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
283*06c3fb27SDimitry Andric   case X86::VPERMILPSZmi:
284*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
285*06c3fb27SDimitry Andric   case X86::VPERMILPSZ128mikz:
286*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
287*06c3fb27SDimitry Andric   case X86::VPERMILPSZ256mikz:
288*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
289*06c3fb27SDimitry Andric   case X86::VPERMILPSZmikz:
290*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
291*06c3fb27SDimitry Andric   case X86::VPERMILPSZ128mik:
292*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
293*06c3fb27SDimitry Andric   case X86::VPERMILPSZ256mik:
294*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
295*06c3fb27SDimitry Andric   case X86::VPERMILPSZmik:
296*06c3fb27SDimitry Andric     return ProcessVPERMILPSmi(X86::VPSHUFDZmik);
297*06c3fb27SDimitry Andric 
298*06c3fb27SDimitry Andric   case X86::MOVLHPSrr:
299*06c3fb27SDimitry Andric   case X86::UNPCKLPDrr:
300*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
301*06c3fb27SDimitry Andric   case X86::VMOVLHPSrr:
302*06c3fb27SDimitry Andric   case X86::VUNPCKLPDrr:
303*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
304*06c3fb27SDimitry Andric   case X86::VUNPCKLPDYrr:
305*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
306*06c3fb27SDimitry Andric     // VMOVLHPS is always 128 bits.
307*06c3fb27SDimitry Andric   case X86::VMOVLHPSZrr:
308*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ128rr:
309*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
310*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ256rr:
311*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
312*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZrr:
313*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
314*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ128rrk:
315*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
316*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ256rrk:
317*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
318*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZrrk:
319*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
320*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ128rrkz:
321*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
322*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ256rrkz:
323*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
324*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZrrkz:
325*06c3fb27SDimitry Andric     return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
326*06c3fb27SDimitry Andric   case X86::UNPCKHPDrr:
327*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
328*06c3fb27SDimitry Andric   case X86::VUNPCKHPDrr:
329*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
330*06c3fb27SDimitry Andric   case X86::VUNPCKHPDYrr:
331*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
332*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ128rr:
333*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
334*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ256rr:
335*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
336*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZrr:
337*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
338*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ128rrk:
339*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
340*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ256rrk:
341*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
342*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZrrk:
343*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
344*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ128rrkz:
345*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
346*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ256rrkz:
347*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
348*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZrrkz:
349*06c3fb27SDimitry Andric     return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
350*06c3fb27SDimitry Andric   case X86::UNPCKLPDrm:
351*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
352*06c3fb27SDimitry Andric   case X86::VUNPCKLPDrm:
353*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
354*06c3fb27SDimitry Andric   case X86::VUNPCKLPDYrm:
355*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
356*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ128rm:
357*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
358*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ256rm:
359*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
360*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZrm:
361*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
362*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ128rmk:
363*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
364*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ256rmk:
365*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
366*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZrmk:
367*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
368*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ128rmkz:
369*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
370*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZ256rmkz:
371*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
372*06c3fb27SDimitry Andric   case X86::VUNPCKLPDZrmkz:
373*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
374*06c3fb27SDimitry Andric   case X86::UNPCKHPDrm:
375*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
376*06c3fb27SDimitry Andric   case X86::VUNPCKHPDrm:
377*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
378*06c3fb27SDimitry Andric   case X86::VUNPCKHPDYrm:
379*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
380*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ128rm:
381*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
382*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ256rm:
383*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
384*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZrm:
385*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
386*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ128rmk:
387*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
388*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ256rmk:
389*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
390*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZrmk:
391*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
392*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ128rmkz:
393*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
394*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZ256rmkz:
395*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
396*06c3fb27SDimitry Andric   case X86::VUNPCKHPDZrmkz:
397*06c3fb27SDimitry Andric     return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);
398*06c3fb27SDimitry Andric 
399*06c3fb27SDimitry Andric   case X86::UNPCKLPSrr:
400*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::PUNPCKLDQrr);
401*06c3fb27SDimitry Andric   case X86::VUNPCKLPSrr:
402*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQrr);
403*06c3fb27SDimitry Andric   case X86::VUNPCKLPSYrr:
404*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQYrr);
405*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ128rr:
406*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr);
407*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ256rr:
408*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr);
409*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZrr:
410*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZrr);
411*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ128rrk:
412*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk);
413*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ256rrk:
414*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk);
415*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZrrk:
416*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk);
417*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ128rrkz:
418*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz);
419*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ256rrkz:
420*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz);
421*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZrrkz:
422*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz);
423*06c3fb27SDimitry Andric   case X86::UNPCKHPSrr:
424*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::PUNPCKHDQrr);
425*06c3fb27SDimitry Andric   case X86::VUNPCKHPSrr:
426*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQrr);
427*06c3fb27SDimitry Andric   case X86::VUNPCKHPSYrr:
428*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQYrr);
429*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ128rr:
430*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr);
431*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ256rr:
432*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr);
433*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZrr:
434*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZrr);
435*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ128rrk:
436*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk);
437*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ256rrk:
438*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk);
439*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZrrk:
440*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk);
441*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ128rrkz:
442*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz);
443*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ256rrkz:
444*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz);
445*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZrrkz:
446*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz);
447*06c3fb27SDimitry Andric   case X86::UNPCKLPSrm:
448*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::PUNPCKLDQrm);
449*06c3fb27SDimitry Andric   case X86::VUNPCKLPSrm:
450*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQrm);
451*06c3fb27SDimitry Andric   case X86::VUNPCKLPSYrm:
452*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQYrm);
453*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ128rm:
454*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm);
455*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ256rm:
456*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm);
457*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZrm:
458*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZrm);
459*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ128rmk:
460*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk);
461*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ256rmk:
462*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk);
463*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZrmk:
464*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk);
465*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ128rmkz:
466*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz);
467*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZ256rmkz:
468*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz);
469*06c3fb27SDimitry Andric   case X86::VUNPCKLPSZrmkz:
470*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz);
471*06c3fb27SDimitry Andric   case X86::UNPCKHPSrm:
472*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::PUNPCKHDQrm);
473*06c3fb27SDimitry Andric   case X86::VUNPCKHPSrm:
474*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQrm);
475*06c3fb27SDimitry Andric   case X86::VUNPCKHPSYrm:
476*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQYrm);
477*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ128rm:
478*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm);
479*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ256rm:
480*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm);
481*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZrm:
482*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZrm);
483*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ128rmk:
484*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk);
485*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ256rmk:
486*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk);
487*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZrmk:
488*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk);
489*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ128rmkz:
490*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz);
491*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZ256rmkz:
492*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
493*06c3fb27SDimitry Andric   case X86::VUNPCKHPSZrmkz:
494*06c3fb27SDimitry Andric     return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
495*06c3fb27SDimitry Andric   default:
496*06c3fb27SDimitry Andric     return false;
497*06c3fb27SDimitry Andric   }
498*06c3fb27SDimitry Andric }
499*06c3fb27SDimitry Andric 
runOnMachineFunction(MachineFunction & MF)500*06c3fb27SDimitry Andric bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
501*06c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
502*06c3fb27SDimitry Andric   bool Changed = false;
503*06c3fb27SDimitry Andric   ST = &MF.getSubtarget<X86Subtarget>();
504*06c3fb27SDimitry Andric   TII = ST->getInstrInfo();
505*06c3fb27SDimitry Andric   SM = &ST->getSchedModel();
506*06c3fb27SDimitry Andric 
507*06c3fb27SDimitry Andric   for (MachineBasicBlock &MBB : MF) {
508*06c3fb27SDimitry Andric     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
509*06c3fb27SDimitry Andric       if (processInstruction(MF, MBB, I)) {
510*06c3fb27SDimitry Andric         ++NumInstChanges;
511*06c3fb27SDimitry Andric         Changed = true;
512*06c3fb27SDimitry Andric       }
513*06c3fb27SDimitry Andric     }
514*06c3fb27SDimitry Andric   }
515*06c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
516*06c3fb27SDimitry Andric   return Changed;
517*06c3fb27SDimitry Andric }
518