//===- X86CompressEVEX.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
// when possible in order to reduce code size or facilitate HW decoding.
//
// Possible compression:
//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
//   c. NDD (EVEX) -> non-NDD (legacy)
//   d. NF_ND (EVEX) -> NF (EVEX)
//
// Compressions a, b and c generally reduce code size, with a few exceptions
// such as the promoted 16-bit CRC32, which is as long as the legacy version.
//
// legacy:
//   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
// promoted:
//   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
//
// From a performance perspective, the two forms should be equivalent (same
// uops and execution ports). From an FMV perspective, the legacy encoding is
// preferred because it can execute in more places (broader HW install base),
// so we still do the compression.
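//
// In the common case, though, replacing the 4-byte EVEX prefix with a 2- or
// 3-byte VEX prefix does save space, e.g. (illustrative encodings):
//
// EVEX:
//   vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
// VEX:
//   vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]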
//
// Compression d can help hardware decode (HW may skip reading the NDD
// register) although the instruction length remains unchanged.
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include <atomic>
#include <cassert>
#include <cstdint>

using namespace llvm;

// Entry type for the generated EVEX compression tables included below.
struct X86CompressEVEXTableEntry {
  uint16_t OldOpc;
  uint16_t NewOpc;

  bool operator<(const X86CompressEVEXTableEntry &RHS) const {
    return OldOpc < RHS.OldOpc;
  }

  friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) {
    return TE.OldOpc < Opc;
  }
};
#include "X86GenCompressEVEXTables.inc"

#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
#define COMP_EVEX_NAME "x86-compress-evex"

#define DEBUG_TYPE COMP_EVEX_NAME

namespace {

class CompressEVEXPass : public MachineFunctionPass {
public:
  static char ID;
  CompressEVEXPass() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override { return COMP_EVEX_DESC; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  // This pass runs after regalloc and doesn't support VReg operands.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }
};

} // end anonymous namespace

char CompressEVEXPass::ID = 0;

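// VEX (and legacy) encodings can only address registers 0-15, so any operand
// in XMM16-XMM31, YMM16-YMM31 or an APX extended GPR requires EVEX and blocks
// the compression.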
static bool usesExtendedRegister(const MachineInstr &MI) {
  auto isHiRegIdx = [](unsigned Reg) {
    // Check for XMM register with indexes between 16 - 31.
    if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
      return true;
    // Check for YMM register with indexes between 16 - 31.
    if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
      return true;
    // Check for GPR with indexes between 16 - 31.
    if (X86II::isApxExtendedReg(Reg))
      return true;
    return false;
  };

  // Check that operands are not ZMM regs or
  // XMM/YMM regs with hi indexes between 16 - 31.
  for (const MachineOperand &MO : MI.explicit_operands()) {
    if (!MO.isReg())
      continue;

    Register Reg = MO.getReg();
    assert(!X86II::isZMMReg(Reg) &&
           "ZMM instructions should not be in the EVEX->VEX tables");
    if (isHiRegIdx(Reg))
      return true;
  }

  return false;
}

static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) {
  switch (OldOpc) {
  default:
    return true;
  case X86::VCVTNEPS2BF16Z128rm:
  case X86::VCVTNEPS2BF16Z128rr:
  case X86::VCVTNEPS2BF16Z256rm:
  case X86::VCVTNEPS2BF16Z256rr:
    return ST.hasAVXNECONVERT();
  case X86::VPDPBUSDSZ128m:
  case X86::VPDPBUSDSZ128r:
  case X86::VPDPBUSDSZ256m:
  case X86::VPDPBUSDSZ256r:
  case X86::VPDPBUSDZ128m:
  case X86::VPDPBUSDZ128r:
  case X86::VPDPBUSDZ256m:
  case X86::VPDPBUSDZ256r:
  case X86::VPDPWSSDSZ128m:
  case X86::VPDPWSSDSZ128r:
  case X86::VPDPWSSDSZ256m:
  case X86::VPDPWSSDSZ256r:
  case X86::VPDPWSSDZ128m:
  case X86::VPDPWSSDZ128r:
  case X86::VPDPWSSDZ256m:
  case X86::VPDPWSSDZ256r:
    return ST.hasAVXVNNI();
  case X86::VPMADD52HUQZ128m:
  case X86::VPMADD52HUQZ128r:
  case X86::VPMADD52HUQZ256m:
  case X86::VPMADD52HUQZ256r:
  case X86::VPMADD52LUQZ128m:
  case X86::VPMADD52LUQZ128r:
  case X86::VPMADD52LUQZ256m:
  case X86::VPMADD52LUQZ256r:
    return ST.hasAVXIFMA();
  }
}

// Do any custom cleanup needed to finalize the conversion.
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
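  // NewOpc is only referenced by the asserts below; this cast silences unused
  // variable warnings in NDEBUG builds.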
  (void)NewOpc;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case X86::VALIGNDZ128rri:
  case X86::VALIGNDZ128rmi:
  case X86::VALIGNQZ128rri:
  case X86::VALIGNQZ128rmi: {
    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
           "Unexpected new opcode!");
    unsigned Scale =
        (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    Imm.setImm(Imm.getImm() * Scale);
    break;
  }
  case X86::VSHUFF32X4Z256rmi:
  case X86::VSHUFF32X4Z256rri:
  case X86::VSHUFF64X2Z256rmi:
  case X86::VSHUFF64X2Z256rri:
  case X86::VSHUFI32X4Z256rmi:
  case X86::VSHUFI32X4Z256rri:
  case X86::VSHUFI64X2Z256rmi:
  case X86::VSHUFI64X2Z256rri: {
    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
           "Unexpected new opcode!");
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Set bit 5, move bit 1 to bit 4, copy bit 0.
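    // E.g. ImmVal == 0b11 becomes 0x20 | 0x10 | 0x01 == 0x31.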
    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
    break;
  }
  case X86::VRNDSCALEPDZ128rri:
  case X86::VRNDSCALEPDZ128rmi:
  case X86::VRNDSCALEPSZ128rri:
  case X86::VRNDSCALEPSZ128rmi:
  case X86::VRNDSCALEPDZ256rri:
  case X86::VRNDSCALEPDZ256rmi:
  case X86::VRNDSCALEPSZ256rri:
  case X86::VRNDSCALEPSZ256rmi:
  case X86::VRNDSCALESDZr:
  case X86::VRNDSCALESDZm:
  case X86::VRNDSCALESSZr:
  case X86::VRNDSCALESSZm:
  case X86::VRNDSCALESDZr_Int:
  case X86::VRNDSCALESDZm_Int:
  case X86::VRNDSCALESSZr_Int:
  case X86::VRNDSCALESSZm_Int:
    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Ensure that only bits 3:0 of the immediate are used.
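    // (VRNDSCALE uses bits 7:4 of the immediate as a fixed-point scaling
    // amount, which the VEX VROUND* forms cannot express.)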
    if ((ImmVal & 0xf) != ImmVal)
      return false;
    break;
  }

  return true;
}

static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
  const MCInstrDesc &Desc = MI.getDesc();

  // Check for EVEX instructions only.
  if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
    return false;

  // Instructions with a mask or broadcast operand need the EVEX prefix to
  // carry that information, so they cannot be transformed to VEX encoding.
  if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
    return false;

  // Instructions with EVEX.L2 set operate on 512-bit vectors and can't be
  // converted to VEX.
  if (Desc.TSFlags & X86II::EVEX_L2)
    return false;

  ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable);

  unsigned Opc = MI.getOpcode();
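  // Binary-search the table (sorted by OldOpc) for an entry for this opcode.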
  const auto *I = llvm::lower_bound(Table, Opc);
  if (I == Table.end() || I->OldOpc != Opc)
    return false;

  if (usesExtendedRegister(MI))
    return false;
  if (!checkVEXInstPredicate(Opc, ST))
    return false;
  if (!performCustomAdjustments(MI, I->NewOpc))
    return false;

  MI.setDesc(ST.getInstrInfo()->get(I->NewOpc));
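  // Flag the instruction so the assembly printer can emit a comment noting
  // the compression.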
  MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
  return true;
}

bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
#ifndef NDEBUG
  // Make sure the tables are sorted.
  static std::atomic<bool> TableChecked(false);
  if (!TableChecked.load(std::memory_order_relaxed)) {
    assert(llvm::is_sorted(X86CompressEVEXTable) &&
           "X86CompressEVEXTable is not sorted!");
    TableChecked.store(true, std::memory_order_relaxed);
  }
#endif
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
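  // If the subtarget supports neither AVX512 nor APX extended GPRs (EGPR), it
  // cannot have any EVEX instructions to compress.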
  if (!ST.hasAVX512() && !ST.hasEGPR())
    return false;

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // Traverse the basic block.
    for (MachineInstr &MI : MBB)
      Changed |= CompressEVEXImpl(MI, ST);
  }

  return Changed;
}

INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)

FunctionPass *llvm::createX86CompressEVEXPass() {
  return new CompressEVEXPass();
}