1 //===-- X86FixupInstTunings.cpp - replace instructions -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file does a tuning pass replacing slower machine instructions 10 // with faster ones. We do this here, as opposed to during normal ISel, as 11 // attempting to get the "right" instruction can break patterns. This pass 12 // is not meant search for special cases where an instruction can be transformed 13 // to another, it is only meant to do transformations where the old instruction 14 // is always replacable with the new instructions. For example: 15 // 16 // `vpermq ymm` -> `vshufd ymm` 17 // -- BAD, not always valid (lane cross/non-repeated mask) 18 // 19 // `vpermilps ymm` -> `vshufd ymm` 20 // -- GOOD, always replaceable 21 // 22 //===----------------------------------------------------------------------===// 23 24 #include "X86.h" 25 #include "X86InstrInfo.h" 26 #include "X86Subtarget.h" 27 #include "llvm/ADT/Statistic.h" 28 #include "llvm/CodeGen/MachineFunctionPass.h" 29 #include "llvm/CodeGen/MachineInstrBuilder.h" 30 #include "llvm/CodeGen/MachineRegisterInfo.h" 31 32 using namespace llvm; 33 34 #define DEBUG_TYPE "x86-fixup-inst-tuning" 35 36 STATISTIC(NumInstChanges, "Number of instructions changes"); 37 38 namespace { 39 class X86FixupInstTuningPass : public MachineFunctionPass { 40 public: 41 static char ID; 42 43 X86FixupInstTuningPass() : MachineFunctionPass(ID) {} 44 45 StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; } 46 47 bool runOnMachineFunction(MachineFunction &MF) override; 48 bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, 49 MachineBasicBlock::iterator &I); 50 51 // This pass runs after regalloc and doesn't support VReg operands. 52 MachineFunctionProperties getRequiredProperties() const override { 53 return MachineFunctionProperties().set( 54 MachineFunctionProperties::Property::NoVRegs); 55 } 56 57 private: 58 const X86InstrInfo *TII = nullptr; 59 const X86Subtarget *ST = nullptr; 60 const MCSchedModel *SM = nullptr; 61 }; 62 } // end anonymous namespace 63 64 char X86FixupInstTuningPass::ID = 0; 65 66 INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false) 67 68 FunctionPass *llvm::createX86FixupInstTuning() { 69 return new X86FixupInstTuningPass(); 70 } 71 72 template <typename T> 73 static std::optional<bool> CmpOptionals(T NewVal, T CurVal) { 74 if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal) 75 return *NewVal < *CurVal; 76 77 return std::nullopt; 78 } 79 80 bool X86FixupInstTuningPass::processInstruction( 81 MachineFunction &MF, MachineBasicBlock &MBB, 82 MachineBasicBlock::iterator &I) { 83 MachineInstr &MI = *I; 84 unsigned Opc = MI.getOpcode(); 85 unsigned NumOperands = MI.getDesc().getNumOperands(); 86 87 auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> { 88 // We already checked that SchedModel exists in `NewOpcPreferable`. 89 return MCSchedModel::getReciprocalThroughput( 90 *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass()))); 91 }; 92 93 auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> { 94 // We already checked that SchedModel exists in `NewOpcPreferable`. 95 return MCSchedModel::computeInstrLatency( 96 *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass()))); 97 }; 98 99 auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> { 100 if (unsigned Size = TII->get(Opcode).getSize()) 101 return Size; 102 // Zero size means we where unable to compute it. 103 return std::nullopt; 104 }; 105 106 auto NewOpcPreferable = [&](unsigned NewOpc, 107 bool ReplaceInTie = true) -> bool { 108 std::optional<bool> Res; 109 if (SM->hasInstrSchedModel()) { 110 // Compare tput -> lat -> code size. 111 Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc)); 112 if (Res.has_value()) 113 return *Res; 114 115 Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc)); 116 if (Res.has_value()) 117 return *Res; 118 } 119 120 Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc)); 121 if (Res.has_value()) 122 return *Res; 123 124 // We either have either were unable to get tput/lat/codesize or all values 125 // were equal. Return specified option for a tie. 126 return ReplaceInTie; 127 }; 128 129 // `vpermilps r, i` -> `vshufps r, r, i` 130 // `vpermilps r, i, k` -> `vshufps r, r, i, k` 131 // `vshufps` is always as fast or faster than `vpermilps` and takes 132 // 1 less byte of code size for VEX and SSE encoding. 133 auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool { 134 if (!NewOpcPreferable(NewOpc)) 135 return false; 136 unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm(); 137 MI.removeOperand(NumOperands - 1); 138 MI.addOperand(MI.getOperand(NumOperands - 2)); 139 MI.setDesc(TII->get(NewOpc)); 140 MI.addOperand(MachineOperand::CreateImm(MaskImm)); 141 return true; 142 }; 143 144 // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles. 145 // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less 146 // byte of code size. 147 auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool { 148 // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as 149 // `vpshufd` saves a byte of code size. 150 if (!ST->hasNoDomainDelayShuffle() || 151 !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) 152 return false; 153 MI.setDesc(TII->get(NewOpc)); 154 return true; 155 }; 156 157 // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00` 158 // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff` 159 // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00` 160 // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff` 161 // `vunpcklpd r, m` -> `vunpcklqdq r, m, k` 162 // `vunpckhpd r, m` -> `vunpckhqdq r, m, k` 163 // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k` 164 // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k` 165 // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd` 166 // -> `vunpck{l|h}qdq` 167 // 2) If `vshufpd` faster than `vunpck{l|h}pd` 168 // -> `vshufpd` 169 auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool { 170 if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) 171 return false; 172 173 MI.setDesc(TII->get(NewOpc)); 174 MI.addOperand(MachineOperand::CreateImm(MaskImm)); 175 return true; 176 }; 177 178 auto ProcessUNPCKPDToIntDomain = [&](unsigned NewOpc) -> bool { 179 // TODO it may be worth it to set ReplaceInTie to `true` as there is no real 180 // downside to the integer unpck, but if someone doesn't specify exact 181 // target we won't find it faster. 182 if (!ST->hasNoDomainDelayShuffle() || 183 !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) 184 return false; 185 MI.setDesc(TII->get(NewOpc)); 186 return true; 187 }; 188 189 auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain, 190 unsigned NewOpc) -> bool { 191 if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain)) 192 return true; 193 return ProcessUNPCKPD(NewOpc, 0x00); 194 }; 195 auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain, 196 unsigned NewOpc) -> bool { 197 if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain)) 198 return true; 199 return ProcessUNPCKPD(NewOpc, 0xff); 200 }; 201 202 auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool { 203 return ProcessUNPCKPDToIntDomain(NewOpcIntDomain); 204 }; 205 206 switch (Opc) { 207 case X86::VPERMILPSri: 208 return ProcessVPERMILPSri(X86::VSHUFPSrri); 209 case X86::VPERMILPSYri: 210 return ProcessVPERMILPSri(X86::VSHUFPSYrri); 211 case X86::VPERMILPSZ128ri: 212 return ProcessVPERMILPSri(X86::VSHUFPSZ128rri); 213 case X86::VPERMILPSZ256ri: 214 return ProcessVPERMILPSri(X86::VSHUFPSZ256rri); 215 case X86::VPERMILPSZri: 216 return ProcessVPERMILPSri(X86::VSHUFPSZrri); 217 case X86::VPERMILPSZ128rikz: 218 return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz); 219 case X86::VPERMILPSZ256rikz: 220 return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz); 221 case X86::VPERMILPSZrikz: 222 return ProcessVPERMILPSri(X86::VSHUFPSZrrikz); 223 case X86::VPERMILPSZ128rik: 224 return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik); 225 case X86::VPERMILPSZ256rik: 226 return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik); 227 case X86::VPERMILPSZrik: 228 return ProcessVPERMILPSri(X86::VSHUFPSZrrik); 229 case X86::VPERMILPSmi: 230 return ProcessVPERMILPSmi(X86::VPSHUFDmi); 231 case X86::VPERMILPSYmi: 232 // TODO: See if there is a more generic way we can test if the replacement 233 // instruction is supported. 234 return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false; 235 case X86::VPERMILPSZ128mi: 236 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi); 237 case X86::VPERMILPSZ256mi: 238 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi); 239 case X86::VPERMILPSZmi: 240 return ProcessVPERMILPSmi(X86::VPSHUFDZmi); 241 case X86::VPERMILPSZ128mikz: 242 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz); 243 case X86::VPERMILPSZ256mikz: 244 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz); 245 case X86::VPERMILPSZmikz: 246 return ProcessVPERMILPSmi(X86::VPSHUFDZmikz); 247 case X86::VPERMILPSZ128mik: 248 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik); 249 case X86::VPERMILPSZ256mik: 250 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik); 251 case X86::VPERMILPSZmik: 252 return ProcessVPERMILPSmi(X86::VPSHUFDZmik); 253 254 case X86::MOVLHPSrr: 255 case X86::UNPCKLPDrr: 256 return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri); 257 case X86::VMOVLHPSrr: 258 case X86::VUNPCKLPDrr: 259 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri); 260 case X86::VUNPCKLPDYrr: 261 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri); 262 // VMOVLHPS is always 128 bits. 263 case X86::VMOVLHPSZrr: 264 case X86::VUNPCKLPDZ128rr: 265 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri); 266 case X86::VUNPCKLPDZ256rr: 267 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri); 268 case X86::VUNPCKLPDZrr: 269 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri); 270 case X86::VUNPCKLPDZ128rrk: 271 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik); 272 case X86::VUNPCKLPDZ256rrk: 273 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik); 274 case X86::VUNPCKLPDZrrk: 275 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik); 276 case X86::VUNPCKLPDZ128rrkz: 277 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz); 278 case X86::VUNPCKLPDZ256rrkz: 279 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz); 280 case X86::VUNPCKLPDZrrkz: 281 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz); 282 case X86::UNPCKHPDrr: 283 return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri); 284 case X86::VUNPCKHPDrr: 285 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri); 286 case X86::VUNPCKHPDYrr: 287 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri); 288 case X86::VUNPCKHPDZ128rr: 289 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri); 290 case X86::VUNPCKHPDZ256rr: 291 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri); 292 case X86::VUNPCKHPDZrr: 293 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri); 294 case X86::VUNPCKHPDZ128rrk: 295 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik); 296 case X86::VUNPCKHPDZ256rrk: 297 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik); 298 case X86::VUNPCKHPDZrrk: 299 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik); 300 case X86::VUNPCKHPDZ128rrkz: 301 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz); 302 case X86::VUNPCKHPDZ256rrkz: 303 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz); 304 case X86::VUNPCKHPDZrrkz: 305 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz); 306 case X86::UNPCKLPDrm: 307 return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm); 308 case X86::VUNPCKLPDrm: 309 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm); 310 case X86::VUNPCKLPDYrm: 311 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm); 312 case X86::VUNPCKLPDZ128rm: 313 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm); 314 case X86::VUNPCKLPDZ256rm: 315 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm); 316 case X86::VUNPCKLPDZrm: 317 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm); 318 case X86::VUNPCKLPDZ128rmk: 319 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk); 320 case X86::VUNPCKLPDZ256rmk: 321 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk); 322 case X86::VUNPCKLPDZrmk: 323 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk); 324 case X86::VUNPCKLPDZ128rmkz: 325 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz); 326 case X86::VUNPCKLPDZ256rmkz: 327 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz); 328 case X86::VUNPCKLPDZrmkz: 329 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz); 330 case X86::UNPCKHPDrm: 331 return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm); 332 case X86::VUNPCKHPDrm: 333 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm); 334 case X86::VUNPCKHPDYrm: 335 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm); 336 case X86::VUNPCKHPDZ128rm: 337 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm); 338 case X86::VUNPCKHPDZ256rm: 339 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm); 340 case X86::VUNPCKHPDZrm: 341 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm); 342 case X86::VUNPCKHPDZ128rmk: 343 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk); 344 case X86::VUNPCKHPDZ256rmk: 345 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk); 346 case X86::VUNPCKHPDZrmk: 347 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk); 348 case X86::VUNPCKHPDZ128rmkz: 349 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz); 350 case X86::VUNPCKHPDZ256rmkz: 351 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz); 352 case X86::VUNPCKHPDZrmkz: 353 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz); 354 default: 355 return false; 356 } 357 } 358 359 bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) { 360 LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";); 361 bool Changed = false; 362 ST = &MF.getSubtarget<X86Subtarget>(); 363 TII = ST->getInstrInfo(); 364 SM = &ST->getSchedModel(); 365 366 for (MachineBasicBlock &MBB : MF) { 367 for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { 368 if (processInstruction(MF, MBB, I)) { 369 ++NumInstChanges; 370 Changed = true; 371 } 372 } 373 } 374 LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";); 375 return Changed; 376 } 377