//===-- X86FixupInstTunings.cpp - replace instructions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file does a tuning pass replacing slower machine instructions
// with faster ones. We do this here, as opposed to during normal ISel, as
// attempting to get the "right" instruction can break patterns. This pass
// is not meant to search for special cases where an instruction can be
// transformed to another, it is only meant to do transformations where the
// old instruction is always replaceable with the new instructions. For
// example:
//
//  `vpermq ymm` -> `vshufd ymm`
//      -- BAD, not always valid (lane cross/non-repeated mask)
//
//  `vpermilps ymm` -> `vshufd ymm`
//      -- GOOD, always replaceable
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "x86-fixup-inst-tuning"

STATISTIC(NumInstChanges, "Number of instructions changes");

namespace {
/// Machine pass that walks every instruction after register allocation and
/// swaps known-slower opcodes for always-equivalent faster (or smaller)
/// ones, consulting the subtarget's scheduler model to decide.
class X86FixupInstTuningPass : public MachineFunctionPass {
public:
  static char ID;

  X86FixupInstTuningPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }

  bool runOnMachineFunction(MachineFunction &MF) override;
  // Returns true if the instruction at `I` was rewritten in place.
  bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &I);

  // This pass runs after regalloc and doesn't support VReg operands.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

private:
  // Cached per-function in runOnMachineFunction; never owned by this pass.
  const X86InstrInfo *TII = nullptr;
  const X86Subtarget *ST = nullptr;
  const MCSchedModel *SM = nullptr;
};
} // end anonymous namespace

char X86FixupInstTuningPass::ID = 0;

INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)

FunctionPass *llvm::createX86FixupInstTuning() {
  return new X86FixupInstTuningPass();
}

/// Three-way helper for the tput -> lat -> size comparison chain: returns
/// `*NewVal < *CurVal` when both values are known and differ, and
/// std::nullopt otherwise so the caller falls through to the next metric.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}

bool X86FixupInstTuningPass::processInstruction(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator &I) {
  MachineInstr &MI = *I;
  unsigned Opc = MI.getOpcode();
  unsigned NumOperands = MI.getDesc().getNumOperands();

  // Reciprocal throughput of `Opcode` per the subtarget sched model.
  auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
    // We already checked that SchedModel exists in `NewOpcPreferable`.
    return MCSchedModel::getReciprocalThroughput(
        *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
  };

  // Latency of `Opcode` per the subtarget sched model.
  auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
    // We already checked that SchedModel exists in `NewOpcPreferable`.
    return MCSchedModel::computeInstrLatency(
        *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
  };

  // Static encoded size of `Opcode`, when the tables know it.
  auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
    if (unsigned Size = TII->get(Opcode).getSize())
      return Size;
    // Zero size means we were unable to compute it.
    return std::nullopt;
  };

  // Decide whether replacing the current `Opc` with `NewOpc` is a win.
  // Metrics are consulted in order: throughput, then latency, then code
  // size; `ReplaceInTie` is returned when every metric ties or is unknown.
  auto NewOpcPreferable = [&](unsigned NewOpc,
                              bool ReplaceInTie = true) -> bool {
    std::optional<bool> Res;
    if (SM->hasInstrSchedModel()) {
      // Compare tput -> lat -> code size.
      Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
      if (Res.has_value())
        return *Res;

      Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
      if (Res.has_value())
        return *Res;
    }

    // NOTE(review): argument order here is (Opc, NewOpc), the reverse of
    // the tput/lat comparisons above, so this returns true when the *old*
    // opcode is smaller. That looks inverted relative to the stated intent
    // of preferring the smaller replacement — confirm whether this is
    // deliberate (getSize() is often 0 on x86, so it may rarely fire).
    Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
    if (Res.has_value())
      return *Res;

    // We were either unable to get tput/lat/codesize or all values were
    // equal. Return specified option for a tie.
    return ReplaceInTie;
  };

  // `vpermilps r, i` -> `vshufps r, r, i`
  // `vpermilps r, i, k` -> `vshufps r, r, i, k`
  // `vshufps` is always as fast or faster than `vpermilps` and takes
  // 1 less byte of code size for VEX and SSE encoding.
  // Rewrites in place: drops the trailing imm, duplicates the source
  // register (vshufps needs it twice), swaps the descriptor, re-adds imm.
  auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
    if (!NewOpcPreferable(NewOpc))
      return false;
    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
    MI.removeOperand(NumOperands - 1);
    MI.addOperand(MI.getOperand(NumOperands - 2));
    MI.setDesc(TII->get(NewOpc));
    MI.addOperand(MachineOperand::CreateImm(MaskImm));
    return true;
  };

  // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on
  // shuffles. `vpshufd` is always as fast or faster than `vpermilps` and
  // takes 1 less byte of code size. Operand layout is identical, so only
  // the descriptor changes.
  auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
    // TODO: Might be worth adding bypass delay if -Os/-Oz is enabled as
    // `vpshufd` saves a byte of code size.
    if (!ST->hasNoDomainDelayShuffle() &&
        !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;
    MI.setDesc(TII->get(NewOpc));
    return true;
  };

  // `vunpcklpd/vmovlhps r, r` -> `vshufpd r, r, 0x00`
  // `vunpckhpd/vmovlhps r, r` -> `vshufpd r, r, 0xff`
  // `vunpcklpd r, r, k` -> `vshufpd r, r, 0x00`
  // `vunpckhpd r, r, k` -> `vshufpd r, r, 0xff`
  // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with
  // `vunpck{l|h}pd` as it uses less code size.
  // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of
  // `{V}SHUF{...}PS` as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no
  // codesize cost.
  auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
    // Ties keep the original: shufpd costs an extra immediate byte.
    if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;

    MI.setDesc(TII->get(NewOpc));
    MI.addOperand(MachineOperand::CreateImm(MaskImm));
    return true;
  };

  // Mask 0x00 selects the low element of each source (unpcklpd semantics).
  auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool {
    return ProcessUNPCKPD(NewOpc, 0x00);
  };
  // Mask 0xff selects the high element of each source (unpckhpd
  // semantics); high mask bits are ignored at narrower widths.
  auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool {
    return ProcessUNPCKPD(NewOpc, 0xff);
  };

  switch (Opc) {
  case X86::VPERMILPSri:
    return ProcessVPERMILPSri(X86::VSHUFPSrri);
  case X86::VPERMILPSYri:
    return ProcessVPERMILPSri(X86::VSHUFPSYrri);
  case X86::VPERMILPSZ128ri:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
  case X86::VPERMILPSZ256ri:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
  case X86::VPERMILPSZri:
    return ProcessVPERMILPSri(X86::VSHUFPSZrri);
  case X86::VPERMILPSZ128rikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
  case X86::VPERMILPSZ256rikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
  case X86::VPERMILPSZrikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
  case X86::VPERMILPSZ128rik:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
  case X86::VPERMILPSZ256rik:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
  case X86::VPERMILPSZrik:
    return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
  case X86::VPERMILPSmi:
    return ProcessVPERMILPSmi(X86::VPSHUFDmi);
  case X86::VPERMILPSYmi:
    // TODO: See if there is a more generic way we can test if the
    // replacement instruction is supported.
    return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
  case X86::VPERMILPSZ128mi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
  case X86::VPERMILPSZ256mi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
  case X86::VPERMILPSZmi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
  case X86::VPERMILPSZ128mikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
  case X86::VPERMILPSZ256mikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
  case X86::VPERMILPSZmikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
  case X86::VPERMILPSZ128mik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
  case X86::VPERMILPSZ256mik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
  case X86::VPERMILPSZmik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmik);

    // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
    // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as
    // `{V}SHUF{...}PS` but 1) without increasing code size and 2) can also
    // handle the `mr` case. ICL doesn't have a domain penalty for replacing
    // float unpck -> int unpck, but at this time, I haven't verified the
    // set of processors where its safe.
  case X86::MOVLHPSrr:
  case X86::UNPCKLPDrr:
    return ProcessUNPCKLPDrr(X86::SHUFPDrri);
  case X86::VMOVLHPSrr:
  case X86::VUNPCKLPDrr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDrri);
  case X86::VUNPCKLPDYrr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDYrri);
    // VMOVLHPS is always 128 bits.
  case X86::VMOVLHPSZrr:
  case X86::VUNPCKLPDZ128rr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rri);
  case X86::VUNPCKLPDZ256rr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rri);
  case X86::VUNPCKLPDZrr:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZrri);
  case X86::VUNPCKLPDZ128rrk:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrik);
  case X86::VUNPCKLPDZ256rrk:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrik);
  case X86::VUNPCKLPDZrrk:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZrrik);
  case X86::VUNPCKLPDZ128rrkz:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrikz);
  case X86::VUNPCKLPDZ256rrkz:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrikz);
  case X86::VUNPCKLPDZrrkz:
    return ProcessUNPCKLPDrr(X86::VSHUFPDZrrikz);
  case X86::UNPCKHPDrr:
    return ProcessUNPCKHPDrr(X86::SHUFPDrri);
  case X86::VUNPCKHPDrr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDrri);
  case X86::VUNPCKHPDYrr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDYrri);
  case X86::VUNPCKHPDZ128rr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rri);
  case X86::VUNPCKHPDZ256rr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rri);
  case X86::VUNPCKHPDZrr:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZrri);
  case X86::VUNPCKHPDZ128rrk:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrik);
  case X86::VUNPCKHPDZ256rrk:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrik);
  case X86::VUNPCKHPDZrrk:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZrrik);
  case X86::VUNPCKHPDZ128rrkz:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrikz);
  case X86::VUNPCKHPDZ256rrkz:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrikz);
  case X86::VUNPCKHPDZrrkz:
    return ProcessUNPCKHPDrr(X86::VSHUFPDZrrikz);
  default:
    return false;
  }
}

bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
  bool Changed = false;
  ST = &MF.getSubtarget<X86Subtarget>();
  TII = ST->getInstrInfo();
  SM = &ST->getSchedModel();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
      // Rewrites happen in place, so the iterator stays valid.
      if (processInstruction(MF, MBB, I)) {
        ++NumInstChanges;
        Changed = true;
      }
    }
  }
  LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
  return Changed;
}