//===-- X86FastPreTileConfig.cpp - Fast Tile Register Configure------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to preconfig the shape of physical tile registers
/// It inserts ldtilecfg ahead of each group of tile registers. The algorithm
/// walks each instruction of the basic block in reverse order. All the tile
/// registers that live out of the basic block are spilled and reloaded
/// before their users. It also checks the dependency of the shape to ensure
/// the shape is defined before ldtilecfg.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "fastpretileconfig"

STATISTIC(NumStores, "Number of stores added");
STATISTIC(NumLoads, "Number of loads added");

namespace {

class X86FastPreTileConfig : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const X86Subtarget *ST = nullptr;
  const TargetInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  X86MachineFunctionInfo *X86FI = nullptr;
  MachineFrameInfo *MFI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineBasicBlock *MBB = nullptr;
  int CfgSS = -1;
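  /// Per-PHI bookkeeping used while lowering tile PHIs: the PHI-joined row
  /// and column registers and the PHI-joined stack slot address of the
  /// spilled tile value (see convertPHI).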
  struct PHIInfo {
    Register Row;
    Register Col;
    Register StackAddr;
  };
  DenseMap<MachineInstr *, struct PHIInfo> VisitedPHIs;

  /// Maps virtual regs to the frame index where these values are spilled.
  IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;

  /// Has a bit set for each tile virtual register for which it was determined
  /// that it is alive across blocks.
  BitVector MayLiveAcrossBlocks;

  int getStackSpaceFor(Register VirtReg);
  void InitializeTileConfigStackSpace();
  bool mayLiveOut(Register VirtReg, MachineInstr *CfgMI);
  void spill(MachineBasicBlock::iterator Before, Register VirtReg, bool Kill);
  void reload(MachineBasicBlock::iterator UseMI, Register VirtReg,
              MachineOperand *RowMO, MachineOperand *ColMO);
  void canonicalizePHIs(MachineBasicBlock &MBB);
  void convertPHI(MachineBasicBlock *MBB, MachineInstr &PHI);
  void convertPHIs(MachineBasicBlock &MBB);
  bool configBasicBlock(MachineBasicBlock &MBB);

public:
  X86FastPreTileConfig() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {}

  /// Return the pass name.
  StringRef getPassName() const override {
    return "Fast Tile Register Preconfigure";
  }

  /// Perform tile register configuration.
  bool runOnMachineFunction(MachineFunction &MFunc) override;

  static char ID;
};

} // end anonymous namespace

char X86FastPreTileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86FastPreTileConfig, DEBUG_TYPE,
                      "Fast Tile Register Preconfigure", false, false)
INITIALIZE_PASS_END(X86FastPreTileConfig, DEBUG_TYPE,
                    "Fast Tile Register Preconfigure", false, false)

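/// Return true if \p A appears no later than \p B within \p MBB, i.e. \p A
/// "dominates" \p B in straight-line order. \p B == MBB.end() is treated as
/// being dominated by everything. This is a simple linear scan over the
/// block.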
static bool dominates(MachineBasicBlock &MBB,
                      MachineBasicBlock::const_iterator A,
                      MachineBasicBlock::const_iterator B) {
  auto MBBEnd = MBB.end();
  if (B == MBBEnd)
    return true;

  MachineBasicBlock::const_iterator I = MBB.begin();
  for (; &*I != A && &*I != B; ++I)
    ;

  return &*I == A;
}

/// This allocates space for the specified virtual register to be held on the
/// stack.
int X86FastPreTileConfig::getStackSpaceFor(Register VirtReg) {
  // Find the location Reg would belong...
  int SS = StackSlotForVirtReg[VirtReg];
  // Already has space allocated?
  if (SS != -1)
    return SS;

  // Allocate a new stack object for this spill location...
  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);
  int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment);

  // Assign the slot.
  StackSlotForVirtReg[VirtReg] = FrameIdx;
  return FrameIdx;
}

/// Returns false if \p VirtReg is known to not live out of the current config.
/// If \p VirtReg lives out of the current MBB, it must live out of the current
/// config.
bool X86FastPreTileConfig::mayLiveOut(Register VirtReg, MachineInstr *CfgMI) {
  if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg)))
    return true;

  for (const MachineInstr &UseInst : MRI->use_nodbg_instructions(VirtReg)) {
    if (UseInst.getParent() != MBB) {
      MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
      return true;
    }

    // The use and def are in the same MBB. If the tile register is
    // reconfigured, it is clobbered and we need to spill and reload the
    // tile register.
    if (CfgMI) {
      if (dominates(*MBB, *CfgMI, UseInst)) {
        MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
        return true;
      }
    }
  }

  return false;
}

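// Zero-initialize the 64-byte tile configuration buffer in the stack slot
// CfgSS at function entry, using the widest available vector stores, and
// then write palette = 1 into its first byte.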
void X86FastPreTileConfig::InitializeTileConfigStackSpace() {
  MachineBasicBlock &MBB = MF->front();
  MachineInstr *MI = &*MBB.getFirstNonPHI();
  DebugLoc DL;
  if (ST->hasAVX512()) {
    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), CfgSS)
        .addReg(Zmm);
  } else if (ST->hasAVX2()) {
    Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS)
        .addReg(Ymm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS,
                      32)
        .addReg(Ymm);
  } else {
    assert(ST->hasSSE2() && "AMX should assume SSE2 enabled");
    unsigned StoreOpc = ST->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
    Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 16)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 32)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 48)
        .addReg(Xmm);
  }
  // Fill in the palette first.
  addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), CfgSS)
      .addImm(1);
}

/// Insert spill instruction for \p VirtReg before \p Before.
/// TODO: Update DBG_VALUEs with \p VirtReg operands with the stack slot.
void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before,
                                 Register VirtReg, bool Kill) {
  LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI) << " \n");
  int FI = getStackSpaceFor(VirtReg);
  LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');

  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
  // Don't need shape information for tile store, because it is adjacent to
  // the tile def instruction.
  TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI,
                           Register());
  ++NumStores;

  // TODO: update DBG_VALUEs
}

/// Insert reload instruction for \p OrigReg before \p UseMI.
void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
                                  Register OrigReg, MachineOperand *RowMO,
                                  MachineOperand *ColMO) {
  int FI = getStackSpaceFor(OrigReg);
  const TargetRegisterClass &RC = *MRI->getRegClass(OrigReg);
  Register TileReg;
  // Fold copy to tileload
  // BB1:
  // spill src to s
  //
  // BB2:
  // t = copy src
  // -->
  // t = tileload (s)
  if (UseMI->isCopy())
    TileReg = UseMI->getOperand(0).getReg();
  else
    TileReg = MRI->createVirtualRegister(&RC);
  // Can't use TII->loadRegFromStackSlot(), because we need the shape
  // information for reload.
  // tileloadd (%sp, %idx), %tmm
  unsigned Opc = X86::PTILELOADDV;
  Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
  // FIXME: MBB is not the parent of UseMI.
  MachineInstr *NewMI = BuildMI(*UseMI->getParent(), UseMI, DebugLoc(),
                                TII->get(X86::MOV64ri), StrideReg)
                            .addImm(64);
  NewMI = addFrameReference(
      BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), TII->get(Opc), TileReg)
          .addReg(RowMO->getReg())
          .addReg(ColMO->getReg()),
      FI);
  MachineOperand &MO = NewMI->getOperand(5);
  MO.setReg(StrideReg);
  MO.setIsKill(true);
  RowMO->setIsKill(false);
  ColMO->setIsKill(false);
  // Erase copy instruction after it is folded.
  if (UseMI->isCopy()) {
    UseMI->eraseFromParent();
  } else {
    // Replace the register in the user MI.
    for (auto &MO : UseMI->operands()) {
      if (MO.isReg() && MO.getReg() == OrigReg)
        MO.setReg(TileReg);
    }
  }

  ++NumLoads;
  LLVM_DEBUG(dbgs() << "Reloading " << printReg(OrigReg, TRI) << " into "
                    << printReg(TileReg, TRI) << '\n');
}

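// Return how many tile registers \p Reg covers: 1 for a plain tile register
// (the TILE register class or TMM0..TMM7), 2 for a tile pair (the TILEPAIR
// class or TMM0_TMM1..TMM6_TMM7), and 0 if \p Reg is not a tile register.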
static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
  if (Reg.isVirtual()) {
    unsigned RegClassID = MRI->getRegClass(Reg)->getID();
    if (RegClassID == X86::TILERegClassID)
      return 1;
    if (RegClassID == X86::TILEPAIRRegClassID)
      return 2;
  } else {
    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
      return 1;
    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
      return 2;
  }
  return 0;
}

static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) {
  return getTileDefNum(MRI, VirtReg) > 0;
}

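// Return true if \p MI is a pseudo instruction that defines a tile register,
// i.e. its first operand is a tile def and the row/column shape follows as
// operands 1 and 2.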
static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
  // The instruction must have 3 operands: tile def, row, col.
  if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo())
    return false;
  MachineOperand &MO = MI.getOperand(0);

  if (!MO.isReg())
    return false;

  return getTileDefNum(MRI, MO.getReg()) > 0;
}

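// Return the shape (row/column operands) for \p TileReg by looking through
// COPYs until the defining tile instruction is found.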
static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) {
  MachineInstr *MI = MRI->getVRegDef(TileReg);
  if (isTileDef(MRI, *MI)) {
    MachineOperand *RowMO = &MI->getOperand(1);
    MachineOperand *ColMO = &MI->getOperand(2);
    return ShapeT(RowMO, ColMO, MRI);
  } else if (MI->isCopy()) {
    TileReg = MI->getOperand(1).getReg();
    return getShape(MRI, TileReg);
  }

  // The def should not be a PHI node, because we walk the MBB in reverse post
  // order.
  assert(MI->isPHI() && "Unexpected PHI when get shape.");
  llvm_unreachable("Unexpected MI when get shape.");
}

// BB0:
// spill t0 to s0
// BB1:
// spill t1 to s1
//
// BB2:
// t = phi [t0, bb0] [t1, bb1]
// -->
// row = phi [r0, bb0] [r1, bb1]
// col = phi [c0, bb0] [c1, bb1]
//   s = phi [s0, bb0] [s1, bb1]
//   t = tileload row, col, s
// The new instructions are inserted at the end of the phi nodes. The order
// of the original phi nodes is not preserved.
void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB,
                                      MachineInstr &PHI) {
  // 1. Create instruction to get stack slot address of each incoming block.
  // 2. Create PHI node for the stack address.
  // 3. Create PHI node for shape. If one of the incoming shapes is an
  //    immediate, use the immediate and delete the PHI node.
  // 4. Create tileload instruction from the stack address.
  Register StackAddrReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
  MachineInstrBuilder AddrPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
                                        TII->get(X86::PHI), StackAddrReg);
  Register RowReg = MRI->createVirtualRegister(&X86::GR16RegClass);
  MachineInstrBuilder RowPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
                                       TII->get(X86::PHI), RowReg);
  Register ColReg = MRI->createVirtualRegister(&X86::GR16RegClass);
  MachineInstrBuilder ColPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
                                       TII->get(X86::PHI), ColReg);
  // Record the mapping of phi node and its row/column information.
  VisitedPHIs[&PHI] = {RowReg, ColReg, StackAddrReg};

  for (unsigned I = 1, E = PHI.getNumOperands(); I != E; I += 2) {
    // Get the two incoming operands: the tile register and its MBB.
    Register InTileReg = PHI.getOperand(I).getReg();
    // Mark it as live out, so that it will be spilled when we visit the
    // incoming MBB. Otherwise, since the phi will be deleted, the spill
    // would be missed when visiting the incoming MBB.
    MayLiveAcrossBlocks.set(Register::virtReg2Index(InTileReg));
    MachineBasicBlock *InMBB = PHI.getOperand(I + 1).getMBB();

    MachineInstr *TileDefMI = MRI->getVRegDef(InTileReg);
    MachineBasicBlock::iterator InsertPos;
    if (TileDefMI->isPHI()) {
      InsertPos = TileDefMI->getParent()->getFirstNonPHI();
      if (VisitedPHIs.count(TileDefMI)) { // circular phi reference
        //        def t1
        //       /       \
        //  def t2       t3 = phi(t1, t4) <--
        //       \       /                  |
        //      t4 = phi(t2, t3)-------------
        //
        // For each (row, column and stack address) append phi incoming value.
        // Create r3 = phi(r1, r4)
        // Create r4 = phi(r2, r3)
        Register InRowReg = VisitedPHIs[TileDefMI].Row;
        Register InColReg = VisitedPHIs[TileDefMI].Col;
        Register InStackAddrReg = VisitedPHIs[TileDefMI].StackAddr;
        RowPHI.addReg(InRowReg).addMBB(InMBB);
        ColPHI.addReg(InColReg).addMBB(InMBB);
        AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
        continue;
      } else {
        // Recursively convert PHI to tileload
        convertPHI(TileDefMI->getParent(), *TileDefMI);
        // The PHI node is converted to a tileload instruction. Get the stack
        // address from the tileload operands.
        MachineInstr *TileLoad = MRI->getVRegDef(InTileReg);
        assert(TileLoad && TileLoad->getOpcode() == X86::PTILELOADDV);
        Register InRowReg = TileLoad->getOperand(1).getReg();
        Register InColReg = TileLoad->getOperand(2).getReg();
        Register InStackAddrReg = TileLoad->getOperand(3).getReg();
        RowPHI.addReg(InRowReg).addMBB(InMBB);
        ColPHI.addReg(InColReg).addMBB(InMBB);
        AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
      }
    } else {
      InsertPos = TileDefMI->getIterator();

      // Fill the incoming operand of row/column phi instruction.
      ShapeT Shape = getShape(MRI, InTileReg);
      Shape.getRow()->setIsKill(false);
      Shape.getCol()->setIsKill(false);
      RowPHI.addReg(Shape.getRow()->getReg()).addMBB(InMBB);
      ColPHI.addReg(Shape.getCol()->getReg()).addMBB(InMBB);

      // The incoming tile register lives out of its def BB, so it will be
      // spilled. Create an MI to get the spill stack slot address for the
      // tile register.
      int FI = getStackSpaceFor(InTileReg);
      Register InStackAddrReg =
          MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
      addOffset(BuildMI(*TileDefMI->getParent(), InsertPos, DebugLoc(),
                        TII->get(X86::LEA64r), InStackAddrReg)
                    .addFrameIndex(FI),
                0);
      AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
    }
  }

  MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI();
  Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
  BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::MOV64ri), StrideReg)
      .addImm(64);
  Register TileReg = PHI.getOperand(0).getReg();
  MachineInstr *NewMI = addDirectMem(
      BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::PTILELOADDV), TileReg)
          .addReg(RowReg)
          .addReg(ColReg),
      StackAddrReg);
  MachineOperand &MO = NewMI->getOperand(5);
  MO.setReg(StrideReg);
  MO.setIsKill(true);
  PHI.eraseFromParent();
  VisitedPHIs.erase(&PHI);
}

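// Return true if the first operand of \p MI defines a virtual tile (or tile
// pair) register.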
static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
  MachineOperand &MO = MI.getOperand(0);
  if (MO.isReg() && MO.getReg().isVirtual() && isTileRegister(MRI, MO.getReg()))
    return true;
  return false;
}

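// Break dependencies between tile phis in \p MBB: if a phi's incoming value
// from \p MBB itself is defined by another phi in \p MBB, forward that phi's
// own incoming value from \p MBB instead (see the example inside).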
void X86FastPreTileConfig::canonicalizePHIs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 8> PHIs;

  for (MachineInstr &MI : MBB) {
    if (!MI.isPHI())
      break;
    if (!isTileRegDef(MRI, MI))
      continue;
    PHIs.push_back(&MI);
  }
  // Canonicalize the phi nodes first. One tile phi may depend on a previous
  // phi node. For the case below, we need to convert %t4.
  //
  // BB0:
  // %t3 = phi (t1 BB1, t2 BB0)
  // %t4 = phi (t5 BB1, t3 BB0)
  // -->
  // %t3 = phi (t1 BB1, t2 BB0)
  // %t4 = phi (t5 BB1, t2 BB0)
  //
  while (!PHIs.empty()) {
    MachineInstr *PHI = PHIs.pop_back_val();

    // Find the operand that is incoming from the same MBB and whose def
    // is also a phi node.
    MachineOperand *InMO = nullptr;
    MachineInstr *DefMI = nullptr;
    for (unsigned I = 1, E = PHI->getNumOperands(); I != E; I += 2) {
      Register InTileReg = PHI->getOperand(I).getReg();
      MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB();
      DefMI = MRI->getVRegDef(InTileReg);
      if (InMBB != &MBB || !DefMI->isPHI())
        continue;

      InMO = &PHI->getOperand(I);
      break;
    }
    // If we can't find such an operand, do nothing.
    if (!InMO)
      continue;

    // The current phi node depends on a previous phi node. Break the
    // dependency.
    Register DefTileReg;
    for (unsigned I = 1, E = DefMI->getNumOperands(); I != E; I += 2) {
      MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB();
      if (InMBB != &MBB)
        continue;
      DefTileReg = DefMI->getOperand(I).getReg();
      InMO->setReg(DefTileReg);
      break;
    }
  }
}

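// Convert every tile phi node in \p MBB into a tile reload (PTILELOADDV)
// from its spill slot; see convertPHI above for the details of a single
// conversion.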
void X86FastPreTileConfig::convertPHIs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 8> PHIs;
  for (MachineInstr &MI : MBB) {
    if (!MI.isPHI())
      break;
    if (!isTileRegDef(MRI, MI))
      continue;
    PHIs.push_back(&MI);
  }
  while (!PHIs.empty()) {
    MachineInstr *MI = PHIs.pop_back_val();
    VisitedPHIs.clear();
    convertPHI(&MBB, *MI);
  }
}

// PreTileConfig should configure the tile registers on a per-basic-block
// basis.
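// Illustrative sketch (MIR-like pseudocode, register names, shape values and
// operands are made up / simplified) of what a block may look like after
// this pass: ldtilecfg is inserted after the last shape def the following
// tile defs depend on, and again after calls because the tile configuration
// is call-clobbered.
//   %r:gr16 = MOV16ri 16
//   %c:gr16 = MOV16ri 64
//   PLDTILECFGV %stack.cfg            ; inserted by Config()
//   %t:tile = PTILEZEROV killed %r, killed %c
//   CALL64pcrel32 @foo                ; clobbers the tile configuration
//   PLDTILECFGV %stack.cfg            ; re-config after the call
//   ...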
bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
  this->MBB = &MBB;
  bool Change = false;
  MachineInstr *LastShapeMI = nullptr;
  MachineInstr *LastTileCfg = nullptr;
  bool HasUnconfigTile = false;

  auto Config = [&](MachineInstr &Before) {
    if (CfgSS == -1)
      CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(),
                                     ST->getTileConfigAlignment(), false);
    LastTileCfg = addFrameReference(
        BuildMI(MBB, Before, DebugLoc(), TII->get(X86::PLDTILECFGV)), CfgSS);
    LastShapeMI = nullptr;
    Change = true;
  };
  auto HasTileOperand = [](MachineRegisterInfo *MRI, MachineInstr &MI) {
    for (const MachineOperand &MO : MI.operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isVirtual() && isTileRegister(MRI, Reg))
        return true;
    }
    return false;
  };
  for (MachineInstr &MI : reverse(MBB)) {
    // We have transformed phi nodes before configuring the BB.
    if (MI.isPHI())
      break;
    // Don't collect the shape of a used tile; the tile should be defined
    // before the tile use. Spill and reload would happen if there is only a
    // tile use after ldtilecfg, so the shape can be collected from the reload.
    // Take the code below for example. %t would be reloaded before the
    // tilestore.
    // call
    // ....
    // tilestore %r, %c, %t
    // -->
    // call
    // ldtilecfg
    // %t = tileload %r, %c
    // tilestore %r, %c, %t
    if (HasTileOperand(MRI, MI))
      HasUnconfigTile = true;
    // According to the AMX ABI, all the tile registers including the config
    // register are volatile. The caller needs to save/restore the config
    // register.
    if (MI.isCall() && HasUnconfigTile) {
      MachineBasicBlock::iterator I;
      if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
        I = ++LastShapeMI->getIterator();
      else
        I = ++MI.getIterator();
      Config(*I);
      HasUnconfigTile = false;
      continue;
    }
    if (!isTileDef(MRI, MI))
      continue;
    //
    //---------------------------------------------------------------------
    // Don't handle COPY instructions. If the src and dst of the COPY can be
    // in the same config, as in the case below, we just check the shape of
    // t0.
    // def row0
    // def col0
    // ldtilecfg
    // t0 = tilezero(row0, col0)
    // t1 = copy t0
    // ...
    // If the src and dst of the COPY can NOT be in the same config, as in
    // the case below, a reload is generated before the copy instruction.
    // def row0
    // def col0
    // t0 = tilezero(row0, col0)
    // spill t0
    // ...
    // def row1
    // def col1
    // ldtilecfg
    // t1 = tilezero(row1, col1)
    // reload t0
    // t1 = copy t0
    //---------------------------------------------------------------------
    //
    // If MI dominates the last shape def instruction, we need to insert
    // ldtilecfg after LastShapeMI now. The config doesn't include the
    // current MI.
    //   def row0
    //   def col0
    //   tilezero(row0, col0)  <- MI
    //   def row1
    //   def col1
    //   ldtilecfg             <- insert
    //   tilezero(row1, col1)
    if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
      Config(*(++LastShapeMI->getIterator()));
    MachineOperand *RowMO = &MI.getOperand(1);
    MachineOperand *ColMO = &MI.getOperand(2);
    MachineInstr *RowMI = MRI->getVRegDef(RowMO->getReg());
    MachineInstr *ColMI = MRI->getVRegDef(ColMO->getReg());
    // If the shape is defined in the current MBB, check the domination.
    // FIXME how about loop?
    if (RowMI->getParent() == &MBB) {
      if (!LastShapeMI)
        LastShapeMI = RowMI;
      else if (dominates(MBB, LastShapeMI, RowMI))
        LastShapeMI = RowMI;
    }
    if (ColMI->getParent() == &MBB) {
      if (!LastShapeMI)
        LastShapeMI = ColMI;
      else if (dominates(MBB, LastShapeMI, ColMI))
        LastShapeMI = ColMI;
    }
    unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg());
    if (TileDefNum > 1) {
      for (unsigned I = 1; I < TileDefNum; I++) {
        MachineOperand *ColxMO = &MI.getOperand(2 + I);
        MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg());
        if (ColxMI->getParent() == &MBB) {
          if (!LastShapeMI)
            LastShapeMI = ColxMI;
          else if (dominates(MBB, LastShapeMI, ColxMI))
            LastShapeMI = ColxMI;
        }
      }
    }
    // If there is a user live out of the tilecfg, spill it and reload it
    // before the user.
    Register TileReg = MI.getOperand(0).getReg();
    if (mayLiveOut(TileReg, LastTileCfg))
      spill(++MI.getIterator(), TileReg, false);
    for (MachineInstr &UseMI : MRI->use_instructions(TileReg)) {
      if (UseMI.getParent() == &MBB) {
        // A use must not read the tile across ldtilecfg; only reload if it
        // comes after the last ldtilecfg.
        if (!LastTileCfg || !dominates(MBB, LastTileCfg, UseMI))
          continue;
        // Reload before UseMI.
        reload(UseMI.getIterator(), TileReg, RowMO, ColMO);
      } else {
        // Don't reload for phi instruction, we handle phi reload separately.
        // TODO: merge the reload for the same user MBB.
        if (!UseMI.isPHI())
          reload(UseMI.getIterator(), TileReg, RowMO, ColMO);
      }
    }
  }

  // Configure tile registers at the head of the MBB.
  if (HasUnconfigTile) {
    MachineInstr *Before;
    if (LastShapeMI == nullptr || LastShapeMI->isPHI())
      Before = &*MBB.getFirstNonPHI();
    else
      Before = &*(++LastShapeMI->getIterator());

    Config(*Before);
  }

  return Change;
}

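// Driver: after an early exit for non-AMX functions, canonicalize tile phis,
// then walk the blocks in reverse post order converting tile phis to reloads
// and inserting ldtilecfg per block; finally, if anything changed, emit the
// code that zero-initializes the config memory in the entry block.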
bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
  X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
  // Early exit in the common case of non-AMX code.
  if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
    return false;

  MF = &MFunc;
  MRI = &MFunc.getRegInfo();
  ST = &MFunc.getSubtarget<X86Subtarget>();
  TII = ST->getInstrInfo();
  MFI = &MFunc.getFrameInfo();
  TRI = ST->getRegisterInfo();
  CfgSS = -1;

  unsigned NumVirtRegs = MRI->getNumVirtRegs();

  StackSlotForVirtReg.resize(NumVirtRegs);
  MayLiveAcrossBlocks.clear();
  // We will create registers during config. The "* 3" is to make sure
  // the virtual register number doesn't exceed the size of
  // the bit vector.
  MayLiveAcrossBlocks.resize(NumVirtRegs * 3);
  bool Change = false;
  assert(MRI->isSSA());

  // Canonicalize the phi nodes first.
  for (MachineBasicBlock &MBB : MFunc)
    canonicalizePHIs(MBB);

  // Loop over all of the basic blocks in reverse post order and insert
  // ldtilecfg for tile registers. The reverse post order is to facilitate
  // PHI node conversion.
  ReversePostOrderTraversal<MachineFunction *> RPOT(MF);
  for (MachineBasicBlock *MBB : RPOT) {
    convertPHIs(*MBB);
    Change |= configBasicBlock(*MBB);
  }

  if (Change)
    InitializeTileConfigStackSpace();

  StackSlotForVirtReg.clear();
  return Change;
}

FunctionPass *llvm::createX86FastPreTileConfigPass() {
  return new X86FastPreTileConfig();
}