//===- X86CompressEVEX.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
// when possible in order to reduce code size or facilitate HW decoding.
//
// Possible compressions:
//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
//   c. NDD (EVEX) -> non-NDD (legacy)
//   d. NF_ND (EVEX) -> NF (EVEX)
//
// Compressions a, b and c almost always reduce code size; an exception is the
// promoted 16-bit CRC32, which is as long as the legacy version:
//
//   legacy:
//     crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
//   promoted:
//     crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
//
// From a performance perspective, the two forms should be the same (same uops
// and same EXE ports). From an FMV perspective, the older legacy encoding is
// preferred because it can execute in more places (broader HW install base),
// so we still perform the compression.
//
// Compression d can help hardware decode (HW may skip reading the NDD
// register) although the instruction length remains unchanged.
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include <atomic>
#include <cassert>
#include <cstdint>

using namespace llvm;

#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
#define COMP_EVEX_NAME "x86-compress-evex"

#define DEBUG_TYPE COMP_EVEX_NAME

namespace {
// Include the generated EVEX compression tables.
#define GET_X86_COMPRESS_EVEX_TABLE
#include "X86GenInstrMapping.inc"

class CompressEVEXPass : public MachineFunctionPass {
public:
  static char ID;
  CompressEVEXPass() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override { return COMP_EVEX_DESC; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  // This pass runs after regalloc and doesn't support VReg operands.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }
};

} // end anonymous namespace

char CompressEVEXPass::ID = 0;

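// Return true if MI has an explicit register operand that can only be encoded
// with an EVEX prefix: XMM16-XMM31, YMM16-YMM31, or an APX extended GPR
// (R16-R31). Such operands block compression to VEX or legacy encodings.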
static bool usesExtendedRegister(const MachineInstr &MI) {
  auto isHiRegIdx = [](unsigned Reg) {
    // Check for XMM register with indexes between 16 - 31.
    if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
      return true;
    // Check for YMM register with indexes between 16 - 31.
    if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
      return true;
    // Check for GPR with indexes between 16 - 31.
    if (X86II::isApxExtendedReg(Reg))
      return true;
    return false;
  };

  // Check that operands are not ZMM regs or
  // XMM/YMM regs with hi indexes between 16 - 31.
  for (const MachineOperand &MO : MI.explicit_operands()) {
    if (!MO.isReg())
      continue;

    Register Reg = MO.getReg();
    assert(!X86II::isZMMReg(Reg) &&
           "ZMM instructions should not be in the EVEX->VEX tables");
    if (isHiRegIdx(Reg))
      return true;
  }

  return false;
}

// Do any custom cleanup needed to finalize the conversion.
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
  (void)NewOpc;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case X86::VALIGNDZ128rri:
  case X86::VALIGNDZ128rmi:
  case X86::VALIGNQZ128rri:
  case X86::VALIGNQZ128rmi: {
    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
           "Unexpected new opcode!");
    unsigned Scale =
        (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    Imm.setImm(Imm.getImm() * Scale);
    break;
  }
  case X86::VSHUFF32X4Z256rmi:
  case X86::VSHUFF32X4Z256rri:
  case X86::VSHUFF64X2Z256rmi:
  case X86::VSHUFF64X2Z256rri:
  case X86::VSHUFI32X4Z256rmi:
  case X86::VSHUFI32X4Z256rri:
  case X86::VSHUFI64X2Z256rmi:
  case X86::VSHUFI64X2Z256rri: {
    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
           "Unexpected new opcode!");
    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Set bit 5, move bit 1 to bit 4, copy bit 0.
    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
    break;
  }
  case X86::VRNDSCALEPDZ128rri:
  case X86::VRNDSCALEPDZ128rmi:
  case X86::VRNDSCALEPSZ128rri:
  case X86::VRNDSCALEPSZ128rmi:
  case X86::VRNDSCALEPDZ256rri:
  case X86::VRNDSCALEPDZ256rmi:
  case X86::VRNDSCALEPSZ256rri:
  case X86::VRNDSCALEPSZ256rmi:
  case X86::VRNDSCALESDZr:
  case X86::VRNDSCALESDZm:
  case X86::VRNDSCALESSZr:
  case X86::VRNDSCALESSZm:
  case X86::VRNDSCALESDZr_Int:
  case X86::VRNDSCALESDZm_Int:
  case X86::VRNDSCALESSZr_Int:
  case X86::VRNDSCALESSZm_Int:
    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
    int64_t ImmVal = Imm.getImm();
    // Ensure that only bits 3:0 of the immediate are used.
    if ((ImmVal & 0xf) != ImmVal)
      return false;
    break;
  }

  return true;
}

static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) {
  // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
  //   ->
  // $rbx = ADD64rr $rbx, $rax
  const MCInstrDesc &Desc = MI.getDesc();
  Register Reg0 = MI.getOperand(0).getReg();
  const MachineOperand &Op1 = MI.getOperand(1);
  if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1)
    return false;
  Register Reg1 = Op1.getReg();
  if (Reg1 == Reg0)
    return true;

  // Op1 and Op2 may be commutable for ND instructions.
  if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
      !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0)
    return false;
  // Opcode may change after commute, e.g. SHRD -> SHLD.
  ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
  return true;
}

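// Try to compress a single EVEX-encoded instruction in place. Returns true if
// MI was rewritten to a legacy/VEX encoding or, for ND forms, to the
// corresponding non-ND EVEX (NF) encoding.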
static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
  uint64_t TSFlags = MI.getDesc().TSFlags;

  // Check for EVEX instructions only.
  if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
    return false;

  // Instructions with a mask or 512-bit vector can't be converted to VEX.
  if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
    return false;

  // EVEX_B has several meanings.
  // AVX512:
  //   register form: rounding control or SAE
  //   memory form: broadcast
  //
  // APX:
  //   MAP4: NDD
  //
  // For the AVX512 cases, the EVEX prefix is needed to carry this information,
  // which prevents the transformation to VEX encoding.
  unsigned Opc = MI.getOpcode();
  bool IsND = X86II::hasNewDataDest(TSFlags);
  if ((TSFlags & X86II::EVEX_B) && !IsND)
    return false;
  // MOVBE*rr is special because it has the semantics of NDD but does not set
  // EVEX_B.
  bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
  if (IsNDLike && !isRedundantNewDataDest(MI, ST))
    return false;

  ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);

  Opc = MI.getOpcode();
  const auto *I = llvm::lower_bound(Table, Opc);
  if (I == Table.end() || I->OldOpc != Opc) {
    assert(!IsNDLike && "Missing entry for ND-like instruction");
    return false;
  }

  if (!IsNDLike) {
    if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
        !performCustomAdjustments(MI, I->NewOpc))
      return false;
  }

  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
  MI.setDesc(NewDesc);
  unsigned AsmComment;
  switch (NewDesc.TSFlags & X86II::EncodingMask) {
  case X86II::LEGACY:
    AsmComment = X86::AC_EVEX_2_LEGACY;
    break;
  case X86II::VEX:
    AsmComment = X86::AC_EVEX_2_VEX;
    break;
  case X86II::EVEX:
    AsmComment = X86::AC_EVEX_2_EVEX;
    assert(IsND && (NewDesc.TSFlags & X86II::EVEX_NF) &&
           "Unknown EVEX2EVEX compression");
    break;
  default:
    llvm_unreachable("Unknown EVEX compression");
  }
  MI.setAsmPrinterFlag(AsmComment);
  if (IsNDLike)
    MI.tieOperands(0, 1);

  return true;
}

bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
#ifndef NDEBUG
  // Make sure the tables are sorted.
  static std::atomic<bool> TableChecked(false);
  if (!TableChecked.load(std::memory_order_relaxed)) {
    assert(llvm::is_sorted(X86CompressEVEXTable) &&
           "X86CompressEVEXTable is not sorted!");
    TableChecked.store(true, std::memory_order_relaxed);
  }
#endif
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
    return false;

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // Traverse the basic block.
    for (MachineInstr &MI : MBB)
      Changed |= CompressEVEXImpl(MI, ST);
  }

  return Changed;
}

INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)

FunctionPass *llvm::createX86CompressEVEXPass() {
  return new CompressEVEXPass();
}