xref: /llvm-project/llvm/lib/Target/X86/X86CompressEVEX.cpp (revision 29f11f0a3240dff1e10ed3d4a5412ecb8c762327)
1a5902a4dSShengchen Kan //===- X86CompressEVEX.cpp ------------------------------------------------===//
2a5902a4dSShengchen Kan //
3a5902a4dSShengchen Kan // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4a5902a4dSShengchen Kan // See https://llvm.org/LICENSE.txt for license information.
5a5902a4dSShengchen Kan // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a5902a4dSShengchen Kan //
7a5902a4dSShengchen Kan //===----------------------------------------------------------------------===//
8a5902a4dSShengchen Kan //
9a5902a4dSShengchen Kan // This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
10a5902a4dSShengchen Kan // when possible in order to reduce code size or facilitate HW decoding.
11a5902a4dSShengchen Kan //
12a5902a4dSShengchen Kan // Possible compression:
13a5902a4dSShengchen Kan //   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
1461bb3d49SShengchen Kan //   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
15a5902a4dSShengchen Kan //   c. NDD (EVEX) -> non-NDD (legacy)
16a5902a4dSShengchen Kan //   d. NF_ND (EVEX) -> NF (EVEX)
17a9e8a3a1SShengchen Kan //   e. NonNF (EVEX) -> NF (EVEX)
18a5902a4dSShengchen Kan //
19a5902a4dSShengchen Kan // Compression a, b and c can always reduce code size, with some exceptions
20a5902a4dSShengchen Kan // such as promoted 16-bit CRC32 which is as long as the legacy version.
21a5902a4dSShengchen Kan //
22a5902a4dSShengchen Kan // legacy:
23a5902a4dSShengchen Kan //   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
24a5902a4dSShengchen Kan // promoted:
25a5902a4dSShengchen Kan //   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
26a5902a4dSShengchen Kan //
// From a performance perspective, the two encodings should be the same (same
// uops and same EXE ports). From an FMV perspective, the older legacy encoding
// is preferred because it can execute in more places (broader HW install
// base). So we still do the compression.
31a5902a4dSShengchen Kan //
32a5902a4dSShengchen Kan // Compression d can help hardware decode (HW may skip reading the NDD
33a5902a4dSShengchen Kan // register) although the instruction length remains unchanged.
34a9e8a3a1SShengchen Kan //
35a9e8a3a1SShengchen Kan // Compression e can help hardware skip updating EFLAGS although the instruction
36a9e8a3a1SShengchen Kan // length remains unchanged.
37a5902a4dSShengchen Kan //===----------------------------------------------------------------------===//
38a5902a4dSShengchen Kan 
39a5902a4dSShengchen Kan #include "MCTargetDesc/X86BaseInfo.h"
40a5902a4dSShengchen Kan #include "X86.h"
41a5902a4dSShengchen Kan #include "X86InstrInfo.h"
42a5902a4dSShengchen Kan #include "X86Subtarget.h"
43a5902a4dSShengchen Kan #include "llvm/ADT/StringRef.h"
44a5902a4dSShengchen Kan #include "llvm/CodeGen/MachineFunction.h"
45a5902a4dSShengchen Kan #include "llvm/CodeGen/MachineFunctionPass.h"
46a5902a4dSShengchen Kan #include "llvm/CodeGen/MachineInstr.h"
47a5902a4dSShengchen Kan #include "llvm/CodeGen/MachineOperand.h"
48a5902a4dSShengchen Kan #include "llvm/MC/MCInstrDesc.h"
49a5902a4dSShengchen Kan #include "llvm/Pass.h"
50a5902a4dSShengchen Kan #include <atomic>
51a5902a4dSShengchen Kan #include <cassert>
52a5902a4dSShengchen Kan #include <cstdint>
53a5902a4dSShengchen Kan 
54a5902a4dSShengchen Kan using namespace llvm;
55a5902a4dSShengchen Kan 
56a5902a4dSShengchen Kan #define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
57a5902a4dSShengchen Kan #define COMP_EVEX_NAME "x86-compress-evex"
58a5902a4dSShengchen Kan 
59a5902a4dSShengchen Kan #define DEBUG_TYPE COMP_EVEX_NAME
60a5902a4dSShengchen Kan 
61a5902a4dSShengchen Kan namespace {
6217ecd23fSShengchen Kan // Including the generated EVEX compression tables.
6317ecd23fSShengchen Kan #define GET_X86_COMPRESS_EVEX_TABLE
6417ecd23fSShengchen Kan #include "X86GenInstrMapping.inc"
65a5902a4dSShengchen Kan 
66a5902a4dSShengchen Kan class CompressEVEXPass : public MachineFunctionPass {
67a5902a4dSShengchen Kan public:
68a5902a4dSShengchen Kan   static char ID;
69a5902a4dSShengchen Kan   CompressEVEXPass() : MachineFunctionPass(ID) {}
70a5902a4dSShengchen Kan   StringRef getPassName() const override { return COMP_EVEX_DESC; }
71a5902a4dSShengchen Kan 
72a5902a4dSShengchen Kan   bool runOnMachineFunction(MachineFunction &MF) override;
73a5902a4dSShengchen Kan 
74a5902a4dSShengchen Kan   // This pass runs after regalloc and doesn't support VReg operands.
75a5902a4dSShengchen Kan   MachineFunctionProperties getRequiredProperties() const override {
76a5902a4dSShengchen Kan     return MachineFunctionProperties().set(
77a5902a4dSShengchen Kan         MachineFunctionProperties::Property::NoVRegs);
78a5902a4dSShengchen Kan   }
79a5902a4dSShengchen Kan };
80a5902a4dSShengchen Kan 
81a5902a4dSShengchen Kan } // end anonymous namespace
82a5902a4dSShengchen Kan 
83a5902a4dSShengchen Kan char CompressEVEXPass::ID = 0;
84a5902a4dSShengchen Kan 
85a5902a4dSShengchen Kan static bool usesExtendedRegister(const MachineInstr &MI) {
86a5902a4dSShengchen Kan   auto isHiRegIdx = [](unsigned Reg) {
87a5902a4dSShengchen Kan     // Check for XMM register with indexes between 16 - 31.
88a5902a4dSShengchen Kan     if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
89a5902a4dSShengchen Kan       return true;
90a5902a4dSShengchen Kan     // Check for YMM register with indexes between 16 - 31.
91a5902a4dSShengchen Kan     if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
92a5902a4dSShengchen Kan       return true;
93a5902a4dSShengchen Kan     // Check for GPR with indexes between 16 - 31.
94a5902a4dSShengchen Kan     if (X86II::isApxExtendedReg(Reg))
95a5902a4dSShengchen Kan       return true;
96a5902a4dSShengchen Kan     return false;
97a5902a4dSShengchen Kan   };
98a5902a4dSShengchen Kan 
99a5902a4dSShengchen Kan   // Check that operands are not ZMM regs or
100a5902a4dSShengchen Kan   // XMM/YMM regs with hi indexes between 16 - 31.
101a5902a4dSShengchen Kan   for (const MachineOperand &MO : MI.explicit_operands()) {
102a5902a4dSShengchen Kan     if (!MO.isReg())
103a5902a4dSShengchen Kan       continue;
104a5902a4dSShengchen Kan 
105a5902a4dSShengchen Kan     Register Reg = MO.getReg();
106a5902a4dSShengchen Kan     assert(!X86II::isZMMReg(Reg) &&
107a5902a4dSShengchen Kan            "ZMM instructions should not be in the EVEX->VEX tables");
108a5902a4dSShengchen Kan     if (isHiRegIdx(Reg))
109a5902a4dSShengchen Kan       return true;
110a5902a4dSShengchen Kan   }
111a5902a4dSShengchen Kan 
112a5902a4dSShengchen Kan   return false;
113a5902a4dSShengchen Kan }
114a5902a4dSShengchen Kan 
115a5902a4dSShengchen Kan // Do any custom cleanup needed to finalize the conversion.
116a5902a4dSShengchen Kan static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
117a5902a4dSShengchen Kan   (void)NewOpc;
118a5902a4dSShengchen Kan   unsigned Opc = MI.getOpcode();
119a5902a4dSShengchen Kan   switch (Opc) {
120a5902a4dSShengchen Kan   case X86::VALIGNDZ128rri:
121a5902a4dSShengchen Kan   case X86::VALIGNDZ128rmi:
122a5902a4dSShengchen Kan   case X86::VALIGNQZ128rri:
123a5902a4dSShengchen Kan   case X86::VALIGNQZ128rmi: {
124a5902a4dSShengchen Kan     assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
125a5902a4dSShengchen Kan            "Unexpected new opcode!");
126a5902a4dSShengchen Kan     unsigned Scale =
127a5902a4dSShengchen Kan         (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
128a5902a4dSShengchen Kan     MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
129a5902a4dSShengchen Kan     Imm.setImm(Imm.getImm() * Scale);
130a5902a4dSShengchen Kan     break;
131a5902a4dSShengchen Kan   }
132a5902a4dSShengchen Kan   case X86::VSHUFF32X4Z256rmi:
133a5902a4dSShengchen Kan   case X86::VSHUFF32X4Z256rri:
134a5902a4dSShengchen Kan   case X86::VSHUFF64X2Z256rmi:
135a5902a4dSShengchen Kan   case X86::VSHUFF64X2Z256rri:
136a5902a4dSShengchen Kan   case X86::VSHUFI32X4Z256rmi:
137a5902a4dSShengchen Kan   case X86::VSHUFI32X4Z256rri:
138a5902a4dSShengchen Kan   case X86::VSHUFI64X2Z256rmi:
139a5902a4dSShengchen Kan   case X86::VSHUFI64X2Z256rri: {
140614a064cSSimon Pilgrim     assert((NewOpc == X86::VPERM2F128rri || NewOpc == X86::VPERM2I128rri ||
141614a064cSSimon Pilgrim             NewOpc == X86::VPERM2F128rmi || NewOpc == X86::VPERM2I128rmi) &&
142a5902a4dSShengchen Kan            "Unexpected new opcode!");
143a5902a4dSShengchen Kan     MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
144a5902a4dSShengchen Kan     int64_t ImmVal = Imm.getImm();
145a5902a4dSShengchen Kan     // Set bit 5, move bit 1 to bit 4, copy bit 0.
146a5902a4dSShengchen Kan     Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
147a5902a4dSShengchen Kan     break;
148a5902a4dSShengchen Kan   }
149a5902a4dSShengchen Kan   case X86::VRNDSCALEPDZ128rri:
150a5902a4dSShengchen Kan   case X86::VRNDSCALEPDZ128rmi:
151a5902a4dSShengchen Kan   case X86::VRNDSCALEPSZ128rri:
152a5902a4dSShengchen Kan   case X86::VRNDSCALEPSZ128rmi:
153a5902a4dSShengchen Kan   case X86::VRNDSCALEPDZ256rri:
154a5902a4dSShengchen Kan   case X86::VRNDSCALEPDZ256rmi:
155a5902a4dSShengchen Kan   case X86::VRNDSCALEPSZ256rri:
156a5902a4dSShengchen Kan   case X86::VRNDSCALEPSZ256rmi:
157*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESDZrri:
158*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESDZrmi:
159*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESSZrri:
160*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESSZrmi:
161*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESDZrri_Int:
162*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESDZrmi_Int:
163*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESSZrri_Int:
164*29f11f0aSSimon Pilgrim   case X86::VRNDSCALESSZrmi_Int:
165a5902a4dSShengchen Kan     const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
166a5902a4dSShengchen Kan     int64_t ImmVal = Imm.getImm();
167a5902a4dSShengchen Kan     // Ensure that only bits 3:0 of the immediate are used.
168a5902a4dSShengchen Kan     if ((ImmVal & 0xf) != ImmVal)
169a5902a4dSShengchen Kan       return false;
170a5902a4dSShengchen Kan     break;
171a5902a4dSShengchen Kan   }
172a5902a4dSShengchen Kan 
173a5902a4dSShengchen Kan   return true;
174a5902a4dSShengchen Kan }
175a5902a4dSShengchen Kan 
176bdc7840cSShengchen Kan static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
177bdc7840cSShengchen Kan   uint64_t TSFlags = MI.getDesc().TSFlags;
178bdc7840cSShengchen Kan 
179bdc7840cSShengchen Kan   // Check for EVEX instructions only.
180bdc7840cSShengchen Kan   if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
181bdc7840cSShengchen Kan     return false;
182bdc7840cSShengchen Kan 
183bdc7840cSShengchen Kan   // Instructions with mask or 512-bit vector can't be converted to VEX.
184bdc7840cSShengchen Kan   if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
185bdc7840cSShengchen Kan     return false;
186bdc7840cSShengchen Kan 
187bdc7840cSShengchen Kan   auto IsRedundantNewDataDest = [&](unsigned &Opc) {
1889095eec0SShengchen Kan     // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
1899095eec0SShengchen Kan     //   ->
1909095eec0SShengchen Kan     // $rbx = ADD64rr $rbx, $rax
1919095eec0SShengchen Kan     const MCInstrDesc &Desc = MI.getDesc();
1929095eec0SShengchen Kan     Register Reg0 = MI.getOperand(0).getReg();
1939095eec0SShengchen Kan     const MachineOperand &Op1 = MI.getOperand(1);
1949c4bae7cSShengchen Kan     if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
1959c4bae7cSShengchen Kan         X86::isCFCMOVCC(MI.getOpcode()))
1969095eec0SShengchen Kan       return false;
1979095eec0SShengchen Kan     Register Reg1 = Op1.getReg();
1989095eec0SShengchen Kan     if (Reg1 == Reg0)
1999095eec0SShengchen Kan       return true;
2009095eec0SShengchen Kan 
2019095eec0SShengchen Kan     // Op1 and Op2 may be commutable for ND instructions.
2029095eec0SShengchen Kan     if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
2039095eec0SShengchen Kan         !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0)
2049095eec0SShengchen Kan       return false;
2059095eec0SShengchen Kan     // Opcode may change after commute, e.g. SHRD -> SHLD
2069095eec0SShengchen Kan     ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
207bdc7840cSShengchen Kan     Opc = MI.getOpcode();
2089095eec0SShengchen Kan     return true;
209bdc7840cSShengchen Kan   };
210fb72a445SShengchen Kan 
211fb72a445SShengchen Kan   // EVEX_B has several meanings.
212fb72a445SShengchen Kan   // AVX512:
213fb72a445SShengchen Kan   //  register form: rounding control or SAE
214fb72a445SShengchen Kan   //  memory form: broadcast
215fb72a445SShengchen Kan   //
216fb72a445SShengchen Kan   // APX:
217fb72a445SShengchen Kan   //  MAP4: NDD
218fb72a445SShengchen Kan   //
219fb72a445SShengchen Kan   // For AVX512 cases, EVEX prefix is needed in order to carry this information
220a5902a4dSShengchen Kan   // thus preventing the transformation to VEX encoding.
2219095eec0SShengchen Kan   bool IsND = X86II::hasNewDataDest(TSFlags);
2222acf302cSShengchen Kan   if (TSFlags & X86II::EVEX_B && !IsND)
2232acf302cSShengchen Kan     return false;
224bdc7840cSShengchen Kan   unsigned Opc = MI.getOpcode();
2252acf302cSShengchen Kan   // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
2262acf302cSShengchen Kan   bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
227bdc7840cSShengchen Kan   bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false;
228a5902a4dSShengchen Kan 
229bdc7840cSShengchen Kan   auto GetCompressedOpc = [&](unsigned Opc) -> unsigned {
23017ecd23fSShengchen Kan     ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
231a9e8a3a1SShengchen Kan     const auto I = llvm::lower_bound(Table, Opc);
232bdc7840cSShengchen Kan     if (I == Table.end() || I->OldOpc != Opc)
233bdc7840cSShengchen Kan       return 0;
234a5902a4dSShengchen Kan 
235c2bef33cSShengchen Kan     if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
236fb72a445SShengchen Kan         !performCustomAdjustments(MI, I->NewOpc))
237bdc7840cSShengchen Kan       return 0;
238bdc7840cSShengchen Kan     return I->NewOpc;
239bdc7840cSShengchen Kan   };
240bdc7840cSShengchen Kan   // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
241bdc7840cSShengchen Kan   // dead.
242bdc7840cSShengchen Kan   unsigned NewOpc = IsRedundantNDD
243bdc7840cSShengchen Kan                         ? X86::getNonNDVariant(Opc)
244bdc7840cSShengchen Kan                         : ((IsNDLike && ST.hasNF() &&
245bdc7840cSShengchen Kan                             MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr))
246bdc7840cSShengchen Kan                                ? X86::getNFVariant(Opc)
247bdc7840cSShengchen Kan                                : GetCompressedOpc(Opc));
248bdc7840cSShengchen Kan 
249bdc7840cSShengchen Kan   if (!NewOpc)
250a5902a4dSShengchen Kan     return false;
251a5902a4dSShengchen Kan 
252a9e8a3a1SShengchen Kan   const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(NewOpc);
2531c674666SShengchen Kan   MI.setDesc(NewDesc);
2544f71068bSShengchen Kan   unsigned AsmComment;
2554f71068bSShengchen Kan   switch (NewDesc.TSFlags & X86II::EncodingMask) {
2564f71068bSShengchen Kan   case X86II::LEGACY:
2574f71068bSShengchen Kan     AsmComment = X86::AC_EVEX_2_LEGACY;
2584f71068bSShengchen Kan     break;
2594f71068bSShengchen Kan   case X86II::VEX:
2604f71068bSShengchen Kan     AsmComment = X86::AC_EVEX_2_VEX;
2614f71068bSShengchen Kan     break;
2624f71068bSShengchen Kan   case X86II::EVEX:
2634f71068bSShengchen Kan     AsmComment = X86::AC_EVEX_2_EVEX;
2644f71068bSShengchen Kan     assert(IsND && (NewDesc.TSFlags & X86II::EVEX_NF) &&
2654f71068bSShengchen Kan            "Unknown EVEX2EVEX compression");
2664f71068bSShengchen Kan     break;
2674f71068bSShengchen Kan   default:
2684f71068bSShengchen Kan     llvm_unreachable("Unknown EVEX compression");
2694f71068bSShengchen Kan   }
2701c674666SShengchen Kan   MI.setAsmPrinterFlag(AsmComment);
271a9e8a3a1SShengchen Kan   if (IsRedundantNDD)
2729095eec0SShengchen Kan     MI.tieOperands(0, 1);
2739095eec0SShengchen Kan 
274a5902a4dSShengchen Kan   return true;
275a5902a4dSShengchen Kan }
276a5902a4dSShengchen Kan 
277a5902a4dSShengchen Kan bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
278a5902a4dSShengchen Kan #ifndef NDEBUG
279a5902a4dSShengchen Kan   // Make sure the tables are sorted.
280a5902a4dSShengchen Kan   static std::atomic<bool> TableChecked(false);
281a5902a4dSShengchen Kan   if (!TableChecked.load(std::memory_order_relaxed)) {
2820abf3a93SShengchen Kan     assert(llvm::is_sorted(X86CompressEVEXTable) &&
2830abf3a93SShengchen Kan            "X86CompressEVEXTable is not sorted!");
284a5902a4dSShengchen Kan     TableChecked.store(true, std::memory_order_relaxed);
285a5902a4dSShengchen Kan   }
286a5902a4dSShengchen Kan #endif
287a5902a4dSShengchen Kan   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
288fb72a445SShengchen Kan   if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
289a5902a4dSShengchen Kan     return false;
290a5902a4dSShengchen Kan 
291a5902a4dSShengchen Kan   bool Changed = false;
292a5902a4dSShengchen Kan 
293a5902a4dSShengchen Kan   for (MachineBasicBlock &MBB : MF) {
294a5902a4dSShengchen Kan     // Traverse the basic block.
295a5902a4dSShengchen Kan     for (MachineInstr &MI : MBB)
2960abf3a93SShengchen Kan       Changed |= CompressEVEXImpl(MI, ST);
297a5902a4dSShengchen Kan   }
298a5902a4dSShengchen Kan 
299a5902a4dSShengchen Kan   return Changed;
300a5902a4dSShengchen Kan }
301a5902a4dSShengchen Kan 
302a5902a4dSShengchen Kan INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)
303a5902a4dSShengchen Kan 
304a5902a4dSShengchen Kan FunctionPass *llvm::createX86CompressEVEXPass() {
305a5902a4dSShengchen Kan   return new CompressEVEXPass();
306a5902a4dSShengchen Kan }
307