//===-- SIFoldOperands.cpp - Fold operands ----------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Fold Operands";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  unsigned UseOpNo;
  MachineOperand *OpToFold;
  uint64_t ImmToFold;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), UseOpNo(OpNo) {

    if (FoldOp->isImm()) {
      OpToFold = nullptr;
      ImmToFold = FoldOp->getImm();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isImm() const {
    return !OpToFold;
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
                      "SI Fold Operands", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
                    "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool isSafeToFold(unsigned Opcode) {
  switch(Opcode) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {

    // Special case for v_mac_f32_e64 if we are trying to fold into src2.
    unsigned Opc = MI->getOpcode();
    if (Opc == AMDGPU::V_MAC_F32_e64 &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      // Check if changing this to a v_mad_f32 instruction will allow us to
      // fold the operand.
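      // (V_MAC_F32 computes dst = src0 * src1 + dst, with src2 tied to the
      // destination, which restricts what src2 may hold; V_MAD_F32 computes
      // dst = src0 * src1 + src2 with an untied src2, so the rewrite can make
      // the fold legal. Legality is re-checked by the recursive call below.)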
      MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
      UseOp.isImplicit())) {
    return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
        TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
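    // For example, if the defining instruction is a 64-bit move of the
    // constant 0x123456789, a use through sub0 receives the low dword
    // 0x23456789 and a use through sub1 receives the high dword 0x1
    // (illustrative values).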
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI.use_begin(RegSeqDstReg),
           RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes. Target independent opcodes
  // don't have defined register classes.
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.
  return;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
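      // (On SI, only a small set of "inline" constants can be encoded
      // directly in an instruction, e.g. small integers such as -16..64 and a
      // handful of floating-point values such as +-0.5, +-1.0, +-2.0 and
      // +-4.0. Any other immediate costs an extra 32-bit literal dword per
      // use, so multi-use folds of non-inline constants are skipped here.
      // See SIInstrInfo::isInlineConstant for the exact rule.)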
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 here:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add
      // implicit uses of EXEC, but adding them invalidates the use_iterator,
      // so defer this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (!Fold.isImm()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
        }
      }
    }
  }
  return false;
}