//===- X86CompressEVEX.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
// when possible in order to reduce code size or facilitate HW decoding.
//
// Possible compression:
//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
//   c. NDD (EVEX) -> non-NDD (legacy)
//   d. NF_ND (EVEX) -> NF (EVEX)
//
// Compressions a, b, and c always reduce code size, with some exceptions such
// as the promoted 16-bit CRC32, which is as long as the legacy version:
//
// legacy:
//   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
// promoted:
//   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
//
// From a performance perspective, these should be the same (same uops and same
// EXE ports). From an FMV perspective, the older legacy encoding is preferred
// because it can execute in more places (broader HW install base), so we still
// do the compression.
//
// Compression d can help hardware decode (HW may skip reading the NDD
// register) although the instruction length remains unchanged.
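//
// For example (an illustrative sketch, not taken from the actual tables), an
// NF_ND add whose NDD destination matches a source operand, such as
//   {nf} addl %ecx, %eax, %eax      # EVEX, ND = 1
// could be re-encoded without the new data destination as
//   {nf} addl %ecx, %eax            # EVEX, ND = 0
// at the same instruction length.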
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include <atomic>
#include <cassert>
#include <cstdint>

using namespace llvm;

// Entry type for the generated EVEX compression tables, included below.
struct X86CompressEVEXTableEntry {
  uint16_t OldOpc;
  uint16_t NewOpc;

  bool operator<(const X86CompressEVEXTableEntry &RHS) const {
    return OldOpc < RHS.OldOpc;
  }

  friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) {
    return TE.OldOpc < Opc;
  }
};
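// The generated table is sorted by OldOpc; the comparison operators above let
// CompressEVEXImpl binary-search it with llvm::lower_bound.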
#include "X86GenCompressEVEXTables.inc"

#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
#define COMP_EVEX_NAME "x86-compress-evex"

#define DEBUG_TYPE COMP_EVEX_NAME

namespace {

class CompressEVEXPass : public MachineFunctionPass {
public:
  static char ID;
  CompressEVEXPass() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override { return COMP_EVEX_DESC; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  // This pass runs after regalloc and doesn't support VReg operands.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }
};

} // end anonymous namespace

char CompressEVEXPass::ID = 0;

static bool usesExtendedRegister(const MachineInstr &MI) {
  auto isHiRegIdx = [](unsigned Reg) {
    // Check for XMM registers with indexes 16-31.
    if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
      return true;
    // Check for YMM registers with indexes 16-31.
    if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
      return true;
    // Check for APX extended GPRs with indexes 16-31.
    if (X86II::isApxExtendedReg(Reg))
      return true;
    return false;
  };

  // Check that operands are not ZMM regs or XMM/YMM regs with high indexes
  // (16-31), which cannot be encoded without EVEX.
  for (const MachineOperand &MO : MI.explicit_operands()) {
    if (!MO.isReg())
      continue;

    Register Reg = MO.getReg();
    assert(!X86II::isZMMReg(Reg) &&
           "ZMM instructions should not be in the EVEX->VEX tables");
    if (isHiRegIdx(Reg))
      return true;
  }

  return false;
}

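// Some EVEX instructions have VEX equivalents that are gated on a separate ISA
// extension (AVX-NE-CONVERT, AVX-VNNI, AVX-IFMA) rather than on AVX512, so
// only compress them when the subtarget actually has the VEX form.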
static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) {
  switch (OldOpc) {
  default:
    return true;
  case X86::VCVTNEPS2BF16Z128rm:
  case X86::VCVTNEPS2BF16Z128rr:
  case X86::VCVTNEPS2BF16Z256rm:
  case X86::VCVTNEPS2BF16Z256rr:
    return ST.hasAVXNECONVERT();
  case X86::VPDPBUSDSZ128m:
  case X86::VPDPBUSDSZ128r:
  case X86::VPDPBUSDSZ256m:
  case X86::VPDPBUSDSZ256r:
  case X86::VPDPBUSDZ128m:
  case X86::VPDPBUSDZ128r:
  case X86::VPDPBUSDZ256m:
  case X86::VPDPBUSDZ256r:
  case X86::VPDPWSSDSZ128m:
  case X86::VPDPWSSDSZ128r:
  case X86::VPDPWSSDSZ256m:
  case X86::VPDPWSSDSZ256r:
  case X86::VPDPWSSDZ128m:
  case X86::VPDPWSSDZ128r:
  case X86::VPDPWSSDZ256m:
  case X86::VPDPWSSDZ256r:
    return ST.hasAVXVNNI();
  case X86::VPMADD52HUQZ128m:
  case X86::VPMADD52HUQZ128r:
  case X86::VPMADD52HUQZ256m:
  case X86::VPMADD52HUQZ256r:
  case X86::VPMADD52LUQZ128m:
  case X86::VPMADD52LUQZ128r:
  case X86::VPMADD52LUQZ256m:
  case X86::VPMADD52LUQZ256r:
    return ST.hasAVXIFMA();
  }
}

// Do any custom cleanup needed to finalize the conversion.
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
  (void)NewOpc;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case X86::VALIGNDZ128rri:
  case X86::VALIGNDZ128rmi:
  case X86::VALIGNQZ128rri:
  case X86::VALIGNQZ128rmi: {
    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
           "Unexpected new opcode!");
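    // VALIGND/VALIGNQ take the shift count in elements (dwords/qwords) while
    // VPALIGNR takes it in bytes, so scale the immediate by the element size.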
    unsigned Scale =
        (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    Imm.setImm(Imm.getImm() * Scale);
    break;
  }
  case X86::VSHUFF32X4Z256rmi:
  case X86::VSHUFF32X4Z256rri:
  case X86::VSHUFF64X2Z256rmi:
  case X86::VSHUFF64X2Z256rri:
  case X86::VSHUFI32X4Z256rmi:
  case X86::VSHUFI32X4Z256rri:
  case X86::VSHUFI64X2Z256rmi:
  case X86::VSHUFI64X2Z256rri: {
    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
           "Unexpected new opcode!");
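    // VSHUF{F,I}*Z256 select the two 128-bit result lanes with imm bits 1:0;
    // VPERM2{F,I}128 encodes its lane selectors in imm bits 1:0 and 5:4, so
    // remap the immediate.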
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Set bit 5, move bit 1 to bit 4, copy bit 0.
    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
    break;
  }
  case X86::VRNDSCALEPDZ128rri:
  case X86::VRNDSCALEPDZ128rmi:
  case X86::VRNDSCALEPSZ128rri:
  case X86::VRNDSCALEPSZ128rmi:
  case X86::VRNDSCALEPDZ256rri:
  case X86::VRNDSCALEPDZ256rmi:
  case X86::VRNDSCALEPSZ256rri:
  case X86::VRNDSCALEPSZ256rmi:
  case X86::VRNDSCALESDZr:
  case X86::VRNDSCALESDZm:
  case X86::VRNDSCALESSZr:
  case X86::VRNDSCALESSZm:
  case X86::VRNDSCALESDZr_Int:
  case X86::VRNDSCALESDZm_Int:
  case X86::VRNDSCALESSZr_Int:
  case X86::VRNDSCALESSZm_Int:
    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // VRNDSCALE uses imm bits 7:4 for an extra scaling factor that the VEX
    // VROUND forms cannot express, so only compress when bits 3:0 of the
    // immediate are used.
    if ((ImmVal & 0xf) != ImmVal)
      return false;
    break;
  }

  return true;
}

static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
  uint64_t TSFlags = MI.getDesc().TSFlags;

  // Check for EVEX instructions only.
  if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
    return false;

  // Instructions with a mask register or 512-bit vector operands can't be
  // converted to VEX.
  if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
    return false;

  // EVEX_B has several meanings.
  // AVX512:
  //  register form: rounding control or SAE
  //  memory form: broadcast
  //
  // APX:
  //  MAP4: NDD
  //
  // In the AVX512 cases, the EVEX prefix is needed to carry this information,
  // which prevents the transformation to VEX encoding.
  if (TSFlags & X86II::EVEX_B)
    return false;

  ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable);

  unsigned Opc = MI.getOpcode();
  const auto *I = llvm::lower_bound(Table, Opc);
  if (I == Table.end() || I->OldOpc != Opc)
    return false;

  if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) ||
      !performCustomAdjustments(MI, I->NewOpc))
    return false;

  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
  MI.setDesc(NewDesc);
  uint64_t Encoding = NewDesc.TSFlags & X86II::EncodingMask;
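  // Tag the instruction so the assembly printer can emit a comment noting the
  // EVEX->VEX or EVEX->legacy compression (see X86InstComments).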
  auto AsmComment =
      (Encoding == X86II::VEX) ? X86::AC_EVEX_2_VEX : X86::AC_EVEX_2_LEGACY;
  MI.setAsmPrinterFlag(AsmComment);
  return true;
}

bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
#ifndef NDEBUG
  // Make sure the tables are sorted.
  static std::atomic<bool> TableChecked(false);
  if (!TableChecked.load(std::memory_order_relaxed)) {
    assert(llvm::is_sorted(X86CompressEVEXTable) &&
           "X86CompressEVEXTable is not sorted!");
    TableChecked.store(true, std::memory_order_relaxed);
  }
#endif
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
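  // Nothing to compress if the subtarget supports neither AVX512 nor the APX
  // features (EGPR, NDD) that this pass targets.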
  if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
    return false;

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // Traverse the basic block.
    for (MachineInstr &MI : MBB)
      Changed |= CompressEVEXImpl(MI, ST);
  }

  return Changed;
}

INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)

FunctionPass *llvm::createX86CompressEVEXPass() {
  return new CompressEVEXPass();
}