1 //===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// This pass implements instructions packetization for R600. It unsets isLast 12 /// bit of instructions inside a bundle and substitutes src register with 13 /// PreviousVector when applicable. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Support/Debug.h" 18 #include "AMDGPU.h" 19 #include "AMDGPUSubtarget.h" 20 #include "R600InstrInfo.h" 21 #include "llvm/CodeGen/DFAPacketizer.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineFunctionPass.h" 24 #include "llvm/CodeGen/MachineLoopInfo.h" 25 #include "llvm/CodeGen/Passes.h" 26 #include "llvm/CodeGen/ScheduleDAG.h" 27 #include "llvm/Support/raw_ostream.h" 28 29 using namespace llvm; 30 31 #define DEBUG_TYPE "packets" 32 33 namespace { 34 35 class R600Packetizer : public MachineFunctionPass { 36 37 public: 38 static char ID; 39 R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} 40 41 void getAnalysisUsage(AnalysisUsage &AU) const override { 42 AU.setPreservesCFG(); 43 AU.addRequired<MachineDominatorTree>(); 44 AU.addPreserved<MachineDominatorTree>(); 45 AU.addRequired<MachineLoopInfo>(); 46 AU.addPreserved<MachineLoopInfo>(); 47 MachineFunctionPass::getAnalysisUsage(AU); 48 } 49 50 const char *getPassName() const override { 51 return "R600 Packetizer"; 52 } 53 54 bool runOnMachineFunction(MachineFunction &Fn) override; 55 }; 56 char R600Packetizer::ID = 0; 57 58 class R600PacketizerList : public VLIWPacketizerList { 59 60 private: 61 const R600InstrInfo *TII; 62 const R600RegisterInfo &TRI; 63 bool VLIW5; 64 bool ConsideredInstUsesAlreadyWrittenVectorElement; 65 66 unsigned getSlot(const MachineInstr &MI) const { 67 return TRI.getHWRegChan(MI.getOperand(0).getReg()); 68 } 69 70 /// \returns register to PV chan mapping for bundle/single instructions that 71 /// immediately precedes I. 72 DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I) 73 const { 74 DenseMap<unsigned, unsigned> Result; 75 I--; 76 if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) 77 return Result; 78 MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); 79 if (I->isBundle()) 80 BI++; 81 int LastDstChan = -1; 82 do { 83 bool isTrans = false; 84 int BISlot = getSlot(*BI); 85 if (LastDstChan >= BISlot) 86 isTrans = true; 87 LastDstChan = BISlot; 88 if (TII->isPredicated(*BI)) 89 continue; 90 int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); 91 if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) 92 continue; 93 int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); 94 if (DstIdx == -1) { 95 continue; 96 } 97 unsigned Dst = BI->getOperand(DstIdx).getReg(); 98 if (isTrans || TII->isTransOnly(&*BI)) { 99 Result[Dst] = AMDGPU::PS; 100 continue; 101 } 102 if (BI->getOpcode() == AMDGPU::DOT4_r600 || 103 BI->getOpcode() == AMDGPU::DOT4_eg) { 104 Result[Dst] = AMDGPU::PV_X; 105 continue; 106 } 107 if (Dst == AMDGPU::OQAP) { 108 continue; 109 } 110 unsigned PVReg = 0; 111 switch (TRI.getHWRegChan(Dst)) { 112 case 0: 113 PVReg = AMDGPU::PV_X; 114 break; 115 case 1: 116 PVReg = AMDGPU::PV_Y; 117 break; 118 case 2: 119 PVReg = AMDGPU::PV_Z; 120 break; 121 case 3: 122 PVReg = AMDGPU::PV_W; 123 break; 124 default: 125 llvm_unreachable("Invalid Chan"); 126 } 127 Result[Dst] = PVReg; 128 } while ((++BI)->isBundledWithPred()); 129 return Result; 130 } 131 132 void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs) 133 const { 134 unsigned Ops[] = { 135 AMDGPU::OpName::src0, 136 AMDGPU::OpName::src1, 137 AMDGPU::OpName::src2 138 }; 139 for (unsigned i = 0; i < 3; i++) { 140 int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); 141 if (OperandIdx < 0) 142 continue; 143 unsigned Src = MI.getOperand(OperandIdx).getReg(); 144 const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); 145 if (It != PVs.end()) 146 MI.getOperand(OperandIdx).setReg(It->second); 147 } 148 } 149 public: 150 // Ctor. 151 R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) 152 : VLIWPacketizerList(MF, MLI, nullptr), 153 TII(static_cast<const R600InstrInfo *>( 154 MF.getSubtarget().getInstrInfo())), 155 TRI(TII->getRegisterInfo()) { 156 VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); 157 } 158 159 // initPacketizerState - initialize some internal flags. 160 void initPacketizerState() override { 161 ConsideredInstUsesAlreadyWrittenVectorElement = false; 162 } 163 164 // ignorePseudoInstruction - Ignore bundling of pseudo instructions. 165 bool ignorePseudoInstruction(const MachineInstr &MI, 166 const MachineBasicBlock *MBB) override { 167 return false; 168 } 169 170 // isSoloInstruction - return true if instruction MI can not be packetized 171 // with any other instruction, which means that MI itself is a packet. 172 bool isSoloInstruction(const MachineInstr &MI) override { 173 if (TII->isVector(MI)) 174 return true; 175 if (!TII->isALUInstr(MI.getOpcode())) 176 return true; 177 if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) 178 return true; 179 // XXX: This can be removed once the packetizer properly handles all the 180 // LDS instruction group restrictions. 181 return TII->isLDSInstr(MI.getOpcode()); 182 } 183 184 // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ 185 // together. 186 bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { 187 MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); 188 if (getSlot(*MII) == getSlot(*MIJ)) 189 ConsideredInstUsesAlreadyWrittenVectorElement = true; 190 // Does MII and MIJ share the same pred_sel ? 191 int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), 192 OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); 193 unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, 194 PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; 195 if (PredI != PredJ) 196 return false; 197 if (SUJ->isSucc(SUI)) { 198 for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { 199 const SDep &Dep = SUJ->Succs[i]; 200 if (Dep.getSUnit() != SUI) 201 continue; 202 if (Dep.getKind() == SDep::Anti) 203 continue; 204 if (Dep.getKind() == SDep::Output) 205 if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) 206 continue; 207 return false; 208 } 209 } 210 211 bool ARDef = TII->definesAddressRegister(MII) || 212 TII->definesAddressRegister(MIJ); 213 bool ARUse = TII->usesAddressRegister(MII) || 214 TII->usesAddressRegister(MIJ); 215 216 return !ARDef || !ARUse; 217 } 218 219 // isLegalToPruneDependencies - Is it legal to prune dependece between SUI 220 // and SUJ. 221 bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { 222 return false; 223 } 224 225 void setIsLastBit(MachineInstr *MI, unsigned Bit) const { 226 unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); 227 MI->getOperand(LastOp).setImm(Bit); 228 } 229 230 bool isBundlableWithCurrentPMI(MachineInstr &MI, 231 const DenseMap<unsigned, unsigned> &PV, 232 std::vector<R600InstrInfo::BankSwizzle> &BS, 233 bool &isTransSlot) { 234 isTransSlot = TII->isTransOnly(&MI); 235 assert (!isTransSlot || VLIW5); 236 237 // Is the dst reg sequence legal ? 238 if (!isTransSlot && !CurrentPacketMIs.empty()) { 239 if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) { 240 if (ConsideredInstUsesAlreadyWrittenVectorElement && 241 !TII->isVectorOnly(&MI) && VLIW5) { 242 isTransSlot = true; 243 DEBUG({ 244 dbgs() << "Considering as Trans Inst :"; 245 MI.dump(); 246 }); 247 } 248 else 249 return false; 250 } 251 } 252 253 // Are the Constants limitations met ? 254 CurrentPacketMIs.push_back(&MI); 255 if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { 256 DEBUG({ 257 dbgs() << "Couldn't pack :\n"; 258 MI.dump(); 259 dbgs() << "with the following packets :\n"; 260 for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { 261 CurrentPacketMIs[i]->dump(); 262 dbgs() << "\n"; 263 } 264 dbgs() << "because of Consts read limitations\n"; 265 }); 266 CurrentPacketMIs.pop_back(); 267 return false; 268 } 269 270 // Is there a BankSwizzle set that meet Read Port limitations ? 271 if (!TII->fitsReadPortLimitations(CurrentPacketMIs, 272 PV, BS, isTransSlot)) { 273 DEBUG({ 274 dbgs() << "Couldn't pack :\n"; 275 MI.dump(); 276 dbgs() << "with the following packets :\n"; 277 for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { 278 CurrentPacketMIs[i]->dump(); 279 dbgs() << "\n"; 280 } 281 dbgs() << "because of Read port limitations\n"; 282 }); 283 CurrentPacketMIs.pop_back(); 284 return false; 285 } 286 287 // We cannot read LDS source registrs from the Trans slot. 288 if (isTransSlot && TII->readsLDSSrcReg(&MI)) 289 return false; 290 291 CurrentPacketMIs.pop_back(); 292 return true; 293 } 294 295 MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override { 296 MachineBasicBlock::iterator FirstInBundle = 297 CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front(); 298 const DenseMap<unsigned, unsigned> &PV = 299 getPreviousVector(FirstInBundle); 300 std::vector<R600InstrInfo::BankSwizzle> BS; 301 bool isTransSlot; 302 303 if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { 304 for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { 305 MachineInstr *MI = CurrentPacketMIs[i]; 306 unsigned Op = TII->getOperandIdx(MI->getOpcode(), 307 AMDGPU::OpName::bank_swizzle); 308 MI->getOperand(Op).setImm(BS[i]); 309 } 310 unsigned Op = 311 TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); 312 MI.getOperand(Op).setImm(BS.back()); 313 if (!CurrentPacketMIs.empty()) 314 setIsLastBit(CurrentPacketMIs.back(), 0); 315 substitutePV(MI, PV); 316 MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); 317 if (isTransSlot) { 318 endPacket(std::next(It)->getParent(), std::next(It)); 319 } 320 return It; 321 } 322 endPacket(MI.getParent(), MI); 323 if (TII->isTransOnly(&MI)) 324 return MI; 325 return VLIWPacketizerList::addToPacket(MI); 326 } 327 }; 328 329 bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { 330 const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); 331 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); 332 333 // Instantiate the packetizer. 334 R600PacketizerList Packetizer(Fn, MLI); 335 336 // DFA state table should not be empty. 337 assert(Packetizer.getResourceTracker() && "Empty DFA table!"); 338 339 if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) 340 return false; 341 342 // 343 // Loop over all basic blocks and remove KILL pseudo-instructions 344 // These instructions confuse the dependence analysis. Consider: 345 // D0 = ... (Insn 0) 346 // R0 = KILL R0, D0 (Insn 1) 347 // R0 = ... (Insn 2) 348 // Here, Insn 1 will result in the dependence graph not emitting an output 349 // dependence between Insn 0 and Insn 2. This can lead to incorrect 350 // packetization 351 // 352 for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); 353 MBB != MBBe; ++MBB) { 354 MachineBasicBlock::iterator End = MBB->end(); 355 MachineBasicBlock::iterator MI = MBB->begin(); 356 while (MI != End) { 357 if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || 358 (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { 359 MachineBasicBlock::iterator DeleteMI = MI; 360 ++MI; 361 MBB->erase(DeleteMI); 362 End = MBB->end(); 363 continue; 364 } 365 ++MI; 366 } 367 } 368 369 // Loop over all of the basic blocks. 370 for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); 371 MBB != MBBe; ++MBB) { 372 // Find scheduling regions and schedule / packetize each region. 373 unsigned RemainingCount = MBB->size(); 374 for(MachineBasicBlock::iterator RegionEnd = MBB->end(); 375 RegionEnd != MBB->begin();) { 376 // The next region starts above the previous region. Look backward in the 377 // instruction stream until we find the nearest boundary. 378 MachineBasicBlock::iterator I = RegionEnd; 379 for(;I != MBB->begin(); --I, --RemainingCount) { 380 if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) 381 break; 382 } 383 I = MBB->begin(); 384 385 // Skip empty scheduling regions. 386 if (I == RegionEnd) { 387 RegionEnd = std::prev(RegionEnd); 388 --RemainingCount; 389 continue; 390 } 391 // Skip regions with one instruction. 392 if (I == std::prev(RegionEnd)) { 393 RegionEnd = std::prev(RegionEnd); 394 continue; 395 } 396 397 Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); 398 RegionEnd = I; 399 } 400 } 401 402 return true; 403 404 } 405 406 } // end anonymous namespace 407 408 llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { 409 return new R600Packetizer(tm); 410 } 411