1 //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the lowering of LLVM calls to machine code calls for
11 /// GlobalISel.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/CodeGen/FunctionLoweringInfo.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 
26 #define DEBUG_TYPE "amdgpu-call-lowering"
27 
28 using namespace llvm;
29 
30 namespace {
31 
32 /// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
33 static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
34                                     Register ValVReg, CCValAssign &VA) {
35   if (VA.getLocVT().getSizeInBits() < 32) {
36     // 16-bit types are reported as legal for 32-bit registers. We need to
37     // extend and do a 32-bit copy to avoid the verifier complaining about it.
38     return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
39   }
40 
41   return Handler.extendRegister(ValVReg, VA);
42 }
43 
44 struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
45   AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
46                              MachineInstrBuilder MIB)
47       : OutgoingValueHandler(B, MRI), MIB(MIB) {}
48 
49   MachineInstrBuilder MIB;
50 
51   Register getStackAddress(uint64_t Size, int64_t Offset,
52                            MachinePointerInfo &MPO,
53                            ISD::ArgFlagsTy Flags) override {
54     llvm_unreachable("not implemented");
55   }
56 
57   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
58                             MachinePointerInfo &MPO, CCValAssign &VA) override {
59     llvm_unreachable("not implemented");
60   }
61 
62   void assignValueToReg(Register ValVReg, Register PhysReg,
63                         CCValAssign &VA) override {
64     Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
65 
66     // If this is a scalar return, insert a readfirstlane just in case the value
67     // ends up in a VGPR.
68     // FIXME: Assert this is a shader return.
69     const SIRegisterInfo *TRI
70       = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
71     if (TRI->isSGPRReg(MRI, PhysReg)) {
72       auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
73                                               {MRI.getType(ExtReg)}, false)
74         .addReg(ExtReg);
75       ExtReg = ToSGPR.getReg(0);
76     }
77 
78     MIRBuilder.buildCopy(PhysReg, ExtReg);
79     MIB.addUse(PhysReg, RegState::Implicit);
80   }
81 };
82 
83 struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
84   uint64_t StackUsed = 0;
85 
86   AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
87       : IncomingValueHandler(B, MRI) {}
88 
89   Register getStackAddress(uint64_t Size, int64_t Offset,
90                            MachinePointerInfo &MPO,
91                            ISD::ArgFlagsTy Flags) override {
92     auto &MFI = MIRBuilder.getMF().getFrameInfo();
93 
94     // Byval is assumed to be writable memory, but other stack passed arguments
95     // are not.
96     const bool IsImmutable = !Flags.isByVal();
97     int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
98     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
99     auto AddrReg = MIRBuilder.buildFrameIndex(
100         LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
101     StackUsed = std::max(StackUsed, Size + Offset);
102     return AddrReg.getReg(0);
103   }
104 
105   void assignValueToReg(Register ValVReg, Register PhysReg,
106                         CCValAssign &VA) override {
107     markPhysRegUsed(PhysReg);
108 
109     if (VA.getLocVT().getSizeInBits() < 32) {
110       // 16-bit types are reported as legal for 32-bit registers. We need to do
111       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
112       auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
113 
114       // If we have signext/zeroext, it applies to the whole 32-bit register
115       // before truncation.
116       auto Extended =
117           buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
118       MIRBuilder.buildTrunc(ValVReg, Extended);
119       return;
120     }
121 
122     IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
123   }
124 
125   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
126                             MachinePointerInfo &MPO, CCValAssign &VA) override {
127     MachineFunction &MF = MIRBuilder.getMF();
128 
129     // The reported memory location may be wider than the value.
130     const LLT RegTy = MRI.getType(ValVReg);
131     MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
132 
133     // FIXME: Get alignment
134     auto MMO = MF.getMachineMemOperand(
135         MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
136         inferAlignFromPtrInfo(MF, MPO));
137     MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
138   }
139 
140   /// How the physical register gets marked varies between formal
141   /// parameters (it's a basic-block live-in), and a call instruction
142   /// (it's an implicit-def of the call).
143   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
144 };
145 
146 struct FormalArgHandler : public AMDGPUIncomingArgHandler {
147   FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
148       : AMDGPUIncomingArgHandler(B, MRI) {}
149 
150   void markPhysRegUsed(unsigned PhysReg) override {
151     MIRBuilder.getMBB().addLiveIn(PhysReg);
152   }
153 };
154 
155 struct CallReturnHandler : public AMDGPUIncomingArgHandler {
156   CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
157                     MachineInstrBuilder MIB)
158       : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
159 
160   void markPhysRegUsed(unsigned PhysReg) override {
161     MIB.addDef(PhysReg, RegState::Implicit);
162   }
163 
164   MachineInstrBuilder MIB;
165 };
166 
167 struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
168   /// For tail calls, the byte offset of the call's argument area from the
169   /// callee's. Unused elsewhere.
170   int FPDiff;
171 
172   // Cache the SP register vreg if we need it more than once in this call site.
173   Register SPReg;
174 
175   bool IsTailCall;
176 
177   AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
178                            MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
179                            bool IsTailCall = false, int FPDiff = 0)
180       : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
181         IsTailCall(IsTailCall) {}
182 
183   Register getStackAddress(uint64_t Size, int64_t Offset,
184                            MachinePointerInfo &MPO,
185                            ISD::ArgFlagsTy Flags) override {
186     MachineFunction &MF = MIRBuilder.getMF();
187     const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
188     const LLT S32 = LLT::scalar(32);
189 
190     if (IsTailCall) {
191       Offset += FPDiff;
192       int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
193       auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
194       MPO = MachinePointerInfo::getFixedStack(MF, FI);
195       return FIReg.getReg(0);
196     }
197 
198     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
199 
200     if (!SPReg)
201       SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
202 
203     auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
204 
205     auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
206     MPO = MachinePointerInfo::getStack(MF, Offset);
207     return AddrReg.getReg(0);
208   }
209 
210   void assignValueToReg(Register ValVReg, Register PhysReg,
211                         CCValAssign &VA) override {
212     MIB.addUse(PhysReg, RegState::Implicit);
213     Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
214     MIRBuilder.buildCopy(PhysReg, ExtReg);
215   }
216 
217   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
218                             MachinePointerInfo &MPO, CCValAssign &VA) override {
219     MachineFunction &MF = MIRBuilder.getMF();
220     uint64_t LocMemOffset = VA.getLocMemOffset();
221     const auto &ST = MF.getSubtarget<GCNSubtarget>();
222 
223     auto MMO = MF.getMachineMemOperand(
224       MPO, MachineMemOperand::MOStore, Size,
225       commonAlignment(ST.getStackAlignment(), LocMemOffset));
226     MIRBuilder.buildStore(ValVReg, Addr, *MMO);
227   }
228 
229   void assignValueToAddress(const CallLowering::ArgInfo &Arg,
230                             unsigned ValRegIndex, Register Addr,
231                             uint64_t MemSize, MachinePointerInfo &MPO,
232                             CCValAssign &VA) override {
233     Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
234                            ? extendRegister(Arg.Regs[ValRegIndex], VA)
235                            : Arg.Regs[ValRegIndex];
236 
237     // If we extended the value type we might need to adjust the MMO's
238     // Size. This happens if ComputeValueVTs widened a small type value to a
239     // legal register type (e.g. s8->s16)
240     const LLT RegTy = MRI.getType(ValVReg);
241     MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
242     assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
243   }
244 };
245 }
246 
247 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
248   : CallLowering(&TLI) {
249 }
250 
251 // FIXME: Compatibility shim
252 static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
253   switch (MIOpc) {
254   case TargetOpcode::G_SEXT:
255     return ISD::SIGN_EXTEND;
256   case TargetOpcode::G_ZEXT:
257     return ISD::ZERO_EXTEND;
258   case TargetOpcode::G_ANYEXT:
259     return ISD::ANY_EXTEND;
260   default:
261     llvm_unreachable("not an extend opcode");
262   }
263 }
264 
265 bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
266                                         CallingConv::ID CallConv,
267                                         SmallVectorImpl<BaseArgInfo> &Outs,
268                                         bool IsVarArg) const {
269   // For shaders. Vector types should be explicitly handled by CC.
270   if (AMDGPU::isEntryFunctionCC(CallConv))
271     return true;
272 
273   SmallVector<CCValAssign, 16> ArgLocs;
274   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
275   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
276                  MF.getFunction().getContext());
277 
278   return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
279 }
280 
281 /// Lower the return value for the already existing \p Ret. This assumes that
282 /// \p B's insertion point is correct.
283 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
284                                         const Value *Val, ArrayRef<Register> VRegs,
285                                         MachineInstrBuilder &Ret) const {
286   if (!Val)
287     return true;
288 
289   auto &MF = B.getMF();
290   const auto &F = MF.getFunction();
291   const DataLayout &DL = MF.getDataLayout();
292   MachineRegisterInfo *MRI = B.getMRI();
293   LLVMContext &Ctx = F.getContext();
294 
295   CallingConv::ID CC = F.getCallingConv();
296   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
297 
298   SmallVector<EVT, 8> SplitEVTs;
299   ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
300   assert(VRegs.size() == SplitEVTs.size() &&
301          "For each split Type there should be exactly one VReg.");
302 
303   SmallVector<ArgInfo, 8> SplitRetInfos;
304 
305   for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
306     EVT VT = SplitEVTs[i];
307     Register Reg = VRegs[i];
308     ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
309     setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
310 
311     if (VT.isScalarInteger()) {
312       unsigned ExtendOp = TargetOpcode::G_ANYEXT;
313       if (RetInfo.Flags[0].isSExt()) {
314         assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
315         ExtendOp = TargetOpcode::G_SEXT;
316       } else if (RetInfo.Flags[0].isZExt()) {
317         assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
318         ExtendOp = TargetOpcode::G_ZEXT;
319       }
320 
321       EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
322                                           extOpcodeToISDExtOpcode(ExtendOp));
323       if (ExtVT != VT) {
324         RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
325         LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
326         Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
327       }
328     }
329 
330     if (Reg != RetInfo.Regs[0]) {
331       RetInfo.Regs[0] = Reg;
332       // Reset the arg flags after modifying Reg.
333       setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
334     }
335 
336     splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
337   }
338 
339   CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
340 
341   OutgoingValueAssigner Assigner(AssignFn);
342   AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
343   return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
344                                        CC, F.isVarArg());
345 }
346 
347 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
348                                      ArrayRef<Register> VRegs,
349                                      FunctionLoweringInfo &FLI) const {
350 
351   MachineFunction &MF = B.getMF();
352   MachineRegisterInfo &MRI = MF.getRegInfo();
353   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
354   MFI->setIfReturnsVoid(!Val);
355 
356   assert(!Val == VRegs.empty() && "Return value without a vreg");
357 
358   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
359   const bool IsShader = AMDGPU::isShader(CC);
360   const bool IsWaveEnd =
361       (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
362   if (IsWaveEnd) {
363     B.buildInstr(AMDGPU::S_ENDPGM)
364       .addImm(0);
365     return true;
366   }
367 
368   auto const &ST = MF.getSubtarget<GCNSubtarget>();
369 
370   unsigned ReturnOpc =
371       IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
372 
373   auto Ret = B.buildInstrNoInsert(ReturnOpc);
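  // Shaders return through SI_RETURN_TO_EPILOG; ordinary functions return by
  // writing the saved return address (an SGPR pair) back into the PC with
  // S_SETPC_B64_return, which is why a CCR_SGPR_64 vreg is attached below.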
374   Register ReturnAddrVReg;
375   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
376     ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
377     Ret.addUse(ReturnAddrVReg);
378   }
379 
380   if (!FLI.CanLowerReturn)
381     insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
382   else if (!lowerReturnVal(B, Val, VRegs, Ret))
383     return false;
384 
385   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
386     const SIRegisterInfo *TRI = ST.getRegisterInfo();
387     Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
388                                          &AMDGPU::SGPR_64RegClass);
389     B.buildCopy(ReturnAddrVReg, LiveInReturn);
390   }
391 
392   // TODO: Handle CalleeSavedRegsViaCopy.
393 
394   B.insertInstr(Ret);
395   return true;
396 }
397 
398 void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
399                                            Type *ParamTy,
400                                            uint64_t Offset) const {
401   MachineFunction &MF = B.getMF();
402   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
403   MachineRegisterInfo &MRI = MF.getRegInfo();
404   Register KernArgSegmentPtr =
405     MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
406   Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
407 
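  // Kernel arguments live at constant offsets from the preloaded kernarg
  // segment pointer, so the address is just a pointer add of that offset.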
408   auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
409 
410   B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
411 }
412 
413 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
414                                         uint64_t Offset, Align Alignment,
415                                         Register DstReg) const {
416   MachineFunction &MF = B.getMF();
417   const Function &F = MF.getFunction();
418   const DataLayout &DL = F.getParent()->getDataLayout();
419   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
420   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
421 
422   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
423   Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
424   lowerParameterPtr(PtrReg, B, ParamTy, Offset);
425 
426   MachineMemOperand *MMO = MF.getMachineMemOperand(
427       PtrInfo,
428       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
429           MachineMemOperand::MOInvariant,
430       TypeSize, Alignment);
431 
432   B.buildLoad(DstReg, PtrReg, *MMO);
433 }
434 
435 // Allocate special inputs passed in user SGPRs.
436 static void allocateHSAUserSGPRs(CCState &CCInfo,
437                                  MachineIRBuilder &B,
438                                  MachineFunction &MF,
439                                  const SIRegisterInfo &TRI,
440                                  SIMachineFunctionInfo &Info) {
441   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
442   if (Info.hasPrivateSegmentBuffer()) {
443     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
444     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
445     CCInfo.AllocateReg(PrivateSegmentBufferReg);
446   }
447 
448   if (Info.hasDispatchPtr()) {
449     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
450     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
451     CCInfo.AllocateReg(DispatchPtrReg);
452   }
453 
454   if (Info.hasQueuePtr()) {
455     Register QueuePtrReg = Info.addQueuePtr(TRI);
456     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
457     CCInfo.AllocateReg(QueuePtrReg);
458   }
459 
460   if (Info.hasKernargSegmentPtr()) {
461     MachineRegisterInfo &MRI = MF.getRegInfo();
462     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
463     const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
464     Register VReg = MRI.createGenericVirtualRegister(P4);
465     MRI.addLiveIn(InputPtrReg, VReg);
466     B.getMBB().addLiveIn(InputPtrReg);
467     B.buildCopy(VReg, InputPtrReg);
468     CCInfo.AllocateReg(InputPtrReg);
469   }
470 
471   if (Info.hasDispatchID()) {
472     Register DispatchIDReg = Info.addDispatchID(TRI);
473     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
474     CCInfo.AllocateReg(DispatchIDReg);
475   }
476 
477   if (Info.hasFlatScratchInit()) {
478     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
479     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
480     CCInfo.AllocateReg(FlatScratchInitReg);
481   }
482 
483   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
484   // these from the dispatch pointer.
485 }
486 
487 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
488     MachineIRBuilder &B, const Function &F,
489     ArrayRef<ArrayRef<Register>> VRegs) const {
490   MachineFunction &MF = B.getMF();
491   const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
492   MachineRegisterInfo &MRI = MF.getRegInfo();
493   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
494   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
495   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
496   const DataLayout &DL = F.getParent()->getDataLayout();
497 
498   Info->allocateModuleLDSGlobal(F.getParent());
499 
500   SmallVector<CCValAssign, 16> ArgLocs;
501   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
502 
503   allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
504 
505   unsigned i = 0;
506   const Align KernArgBaseAlign(16);
507   const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
508   uint64_t ExplicitArgOffset = 0;
509 
510   // TODO: Align down to dword alignment and extract bits for extending loads.
511   for (auto &Arg : F.args()) {
512     const bool IsByRef = Arg.hasByRefAttr();
513     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
514     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
515     if (AllocSize == 0)
516       continue;
517 
518     MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
519     if (!ABIAlign)
520       ABIAlign = DL.getABITypeAlign(ArgTy);
521 
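    // Each explicit kernel argument is laid out at the next ABI-aligned offset
    // after the previous one. ArgOffset is its position in the kernarg segment
    // (BaseOffset is where the explicit arguments begin), while
    // ExplicitArgOffset tracks the running end of the explicit argument block.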
522     uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
523     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
524 
525     if (Arg.use_empty()) {
526       ++i;
527       continue;
528     }
529 
530     Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
531 
532     if (IsByRef) {
533       unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
534 
535       assert(VRegs[i].size() == 1 &&
536              "expected only one register for byval pointers");
537       if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
538         lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
539       } else {
540         const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
541         Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
542         lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
543 
544         B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
545       }
546     } else {
547       ArrayRef<Register> OrigArgRegs = VRegs[i];
548       Register ArgReg =
549         OrigArgRegs.size() == 1
550         ? OrigArgRegs[0]
551         : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
552 
553       lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
554       if (OrigArgRegs.size() > 1)
555         unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
556     }
557 
558     ++i;
559   }
560 
561   TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
562   TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
563   return true;
564 }
565 
566 bool AMDGPUCallLowering::lowerFormalArguments(
567     MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
568     FunctionLoweringInfo &FLI) const {
569   CallingConv::ID CC = F.getCallingConv();
570 
571   // The infrastructure for normal calling convention lowering is essentially
572   // useless for kernels. We want to avoid any kind of legalization or argument
573   // splitting.
574   if (CC == CallingConv::AMDGPU_KERNEL)
575     return lowerFormalArgumentsKernel(B, F, VRegs);
576 
577   const bool IsGraphics = AMDGPU::isGraphics(CC);
578   const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
579 
580   MachineFunction &MF = B.getMF();
581   MachineBasicBlock &MBB = B.getMBB();
582   MachineRegisterInfo &MRI = MF.getRegInfo();
583   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
584   const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
585   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
586   const DataLayout &DL = F.getParent()->getDataLayout();
587 
588   Info->allocateModuleLDSGlobal(F.getParent());
589 
590   SmallVector<CCValAssign, 16> ArgLocs;
591   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
592 
593   if (!IsEntryFunc) {
594     Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
595     Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
596                                          &AMDGPU::SGPR_64RegClass);
597     MBB.addLiveIn(ReturnAddrReg);
598     B.buildCopy(LiveInReturn, ReturnAddrReg);
599   }
600 
601   if (Info->hasImplicitBufferPtr()) {
602     Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
603     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
604     CCInfo.AllocateReg(ImplicitBufferPtrReg);
605   }
606 
607   SmallVector<ArgInfo, 32> SplitArgs;
608   unsigned Idx = 0;
609   unsigned PSInputNum = 0;
610 
611   // Insert the hidden sret parameter if the return value won't fit in the
612   // return registers.
613   if (!FLI.CanLowerReturn)
614     insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
615 
616   for (auto &Arg : F.args()) {
617     if (DL.getTypeStoreSize(Arg.getType()) == 0)
618       continue;
619 
620     const bool InReg = Arg.hasAttribute(Attribute::InReg);
621 
622     // SGPR arguments to functions not implemented.
623     if (!IsGraphics && InReg)
624       return false;
625 
626     if (Arg.hasAttribute(Attribute::SwiftSelf) ||
627         Arg.hasAttribute(Attribute::SwiftError) ||
628         Arg.hasAttribute(Attribute::Nest))
629       return false;
630 
631     if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
632       const bool ArgUsed = !Arg.use_empty();
633       bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
634 
635       if (!SkipArg) {
636         Info->markPSInputAllocated(PSInputNum);
637         if (ArgUsed)
638           Info->markPSInputEnabled(PSInputNum);
639       }
640 
641       ++PSInputNum;
642 
643       if (SkipArg) {
644         for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
645           B.buildUndef(VRegs[Idx][I]);
646 
647         ++Idx;
648         continue;
649       }
650     }
651 
652     ArgInfo OrigArg(VRegs[Idx], Arg);
653     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
654     setArgFlags(OrigArg, OrigArgIdx, DL, F);
655 
656     splitToValueTypes(OrigArg, SplitArgs, DL, CC);
657     ++Idx;
658   }
659 
660   // At least one interpolation mode must be enabled or else the GPU will
661   // hang.
662   //
663   // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
664   // set PSInputAddr, the user wants to enable some bits after the compilation
665   // based on run-time states. Since we can't know what the final PSInputEna
666   // will look like, so we shouldn't do anything here and the user should take
667   // responsibility for the correct programming.
668   //
669   // Otherwise, the following restrictions apply:
670   // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
671   // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
672   //   enabled too.
673   if (CC == CallingConv::AMDGPU_PS) {
674     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
675         ((Info->getPSInputAddr() & 0xF) == 0 &&
676          Info->isPSInputAllocated(11))) {
677       CCInfo.AllocateReg(AMDGPU::VGPR0);
678       CCInfo.AllocateReg(AMDGPU::VGPR1);
679       Info->markPSInputAllocated(0);
680       Info->markPSInputEnabled(0);
681     }
682 
683     if (Subtarget.isAmdPalOS()) {
684       // For isAmdPalOS, the user does not enable some bits after compilation
685       // based on run-time states; the register values being generated here are
686       // the final ones set in hardware. Therefore we need to apply the
687       // workaround to PSInputAddr and PSInputEnable together.  (The case where
688       // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
689       // set up an input arg for a particular interpolation mode, but nothing
690       // uses that input arg. Really we should have an earlier pass that removes
691       // such an arg.)
692       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
693       if ((PsInputBits & 0x7F) == 0 ||
694           ((PsInputBits & 0xF) == 0 &&
695            (PsInputBits >> 11 & 1)))
696         Info->markPSInputEnabled(
697           countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
698     }
699   }
700 
701   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
702   CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
703 
704   if (!MBB.empty())
705     B.setInstr(*MBB.begin());
706 
707   if (!IsEntryFunc) {
708     // For the fixed ABI, pass workitem IDs in the last argument register.
709     if (AMDGPUTargetMachine::EnableFixedFunctionABI)
710       TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
711   }
712 
713   IncomingValueAssigner Assigner(AssignFn);
714   if (!determineAssignments(Assigner, SplitArgs, CCInfo))
715     return false;
716 
717   FormalArgHandler Handler(B, MRI);
718   if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
719     return false;
720 
721   uint64_t StackOffset = Assigner.StackOffset;
722 
723   if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
724     // Special inputs come after user arguments.
725     TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
726   }
727 
728   // Start adding system SGPRs.
729   if (IsEntryFunc) {
730     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
731   } else {
732     if (!Subtarget.enableFlatScratch())
733       CCInfo.AllocateReg(Info->getScratchRSrcReg());
734     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
735   }
736 
737   // When we tail call, we need to check if the callee's arguments will fit on
738   // the caller's stack. So, whenever we lower formal arguments, we should keep
739   // track of this information, since we might lower a tail call in this
740   // function later.
741   Info->setBytesInStackArgArea(StackOffset);
742 
743   // Move back to the end of the basic block.
744   B.setMBB(MBB);
745 
746   return true;
747 }
748 
749 bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
750                                            CCState &CCInfo,
751                                            SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
752                                            CallLoweringInfo &Info) const {
753   MachineFunction &MF = MIRBuilder.getMF();
754 
755   const AMDGPUFunctionArgInfo *CalleeArgInfo
756     = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
757 
758   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
759   const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
760 
761 
762   // TODO: Unify with private memory register handling. This is complicated by
763   // the fact that at least in kernels, the input argument is not necessarily
764   // in the same location as the input.
765   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
766     AMDGPUFunctionArgInfo::DISPATCH_PTR,
767     AMDGPUFunctionArgInfo::QUEUE_PTR,
768     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
769     AMDGPUFunctionArgInfo::DISPATCH_ID,
770     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
771     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
772     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
773   };
774 
775   MachineRegisterInfo &MRI = MF.getRegInfo();
776 
777   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
778   const AMDGPULegalizerInfo *LI
779     = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
780 
781   for (auto InputID : InputRegs) {
782     const ArgDescriptor *OutgoingArg;
783     const TargetRegisterClass *ArgRC;
784     LLT ArgTy;
785 
786     std::tie(OutgoingArg, ArgRC, ArgTy) =
787         CalleeArgInfo->getPreloadedValue(InputID);
788     if (!OutgoingArg)
789       continue;
790 
791     const ArgDescriptor *IncomingArg;
792     const TargetRegisterClass *IncomingArgRC;
793     std::tie(IncomingArg, IncomingArgRC, ArgTy) =
794         CallerArgInfo.getPreloadedValue(InputID);
795     assert(IncomingArgRC == ArgRC);
796 
797     Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
798 
799     if (IncomingArg) {
800       LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
801     } else {
802       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
803       LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
804     }
805 
806     if (OutgoingArg->isRegister()) {
807       ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
808       if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
809         report_fatal_error("failed to allocate implicit input argument");
810     } else {
811       LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
812       return false;
813     }
814   }
815 
816   // Pack workitem IDs into a single register or pass it as is if already
817   // packed.
818   const ArgDescriptor *OutgoingArg;
819   const TargetRegisterClass *ArgRC;
820   LLT ArgTy;
821 
822   std::tie(OutgoingArg, ArgRC, ArgTy) =
823       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
824   if (!OutgoingArg)
825     std::tie(OutgoingArg, ArgRC, ArgTy) =
826         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
827   if (!OutgoingArg)
828     std::tie(OutgoingArg, ArgRC, ArgTy) =
829         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
830   if (!OutgoingArg)
831     return false;
832 
833   auto WorkitemIDX =
834       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
835   auto WorkitemIDY =
836       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
837   auto WorkitemIDZ =
838       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
839 
840   const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
841   const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
842   const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
843   const LLT S32 = LLT::scalar(32);
844 
845   // If incoming ids are not packed we need to pack them.
846   // FIXME: Should consider known workgroup size to eliminate known 0 cases.
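  // The packed form built below places the X ID in bits [9:0], Y in bits
  // [19:10] (shifted left by 10) and Z in bits [29:20] (shifted left by 20),
  // OR'd together into a single 32-bit register.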
847   Register InputReg;
848   if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
849     InputReg = MRI.createGenericVirtualRegister(S32);
850     LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
851                        std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
852   }
853 
854   if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
855     Register Y = MRI.createGenericVirtualRegister(S32);
856     LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
857                        std::get<2>(WorkitemIDY));
858 
859     Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
860     InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
861   }
862 
863   if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
864     Register Z = MRI.createGenericVirtualRegister(S32);
865     LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
866                        std::get<2>(WorkitemIDZ));
867 
868     Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
869     InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
870   }
871 
872   if (!InputReg) {
873     InputReg = MRI.createGenericVirtualRegister(S32);
874 
875     // Workitem IDs are already packed; any of the present incoming arguments will
876     // carry all required fields.
877     ArgDescriptor IncomingArg = ArgDescriptor::createArg(
878       IncomingArgX ? *IncomingArgX :
879         IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
880     LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
881                        &AMDGPU::VGPR_32RegClass, S32);
882   }
883 
884   if (OutgoingArg->isRegister()) {
885     ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
886     if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
887       report_fatal_error("failed to allocate implicit input argument");
888   } else {
889     LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
890     return false;
891   }
892 
893   return true;
894 }
895 
896 /// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
897 /// CC.
898 static std::pair<CCAssignFn *, CCAssignFn *>
899 getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
900   return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
901 }
902 
903 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
904                               bool IsTailCall) {
905   return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
906 }
907 
908 // Add operands to call instruction to track the callee.
909 static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
910                                   MachineIRBuilder &MIRBuilder,
911                                   AMDGPUCallLowering::CallLoweringInfo &Info) {
912   if (Info.Callee.isReg()) {
913     CallInst.addReg(Info.Callee.getReg());
914     CallInst.addImm(0);
915   } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
916     // The call lowering lightly assumed we can directly encode a call target in
917     // the instruction, which is not the case. Materialize the address here.
918     const GlobalValue *GV = Info.Callee.getGlobal();
919     auto Ptr = MIRBuilder.buildGlobalValue(
920       LLT::pointer(GV->getAddressSpace(), 64), GV);
921     CallInst.addReg(Ptr.getReg(0));
922     CallInst.add(Info.Callee);
923   } else
924     return false;
925 
926   return true;
927 }
928 
929 bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
930     CallLoweringInfo &Info, MachineFunction &MF,
931     SmallVectorImpl<ArgInfo> &InArgs) const {
932   const Function &CallerF = MF.getFunction();
933   CallingConv::ID CalleeCC = Info.CallConv;
934   CallingConv::ID CallerCC = CallerF.getCallingConv();
935 
936   // If the calling conventions match, then everything must be the same.
937   if (CalleeCC == CallerCC)
938     return true;
939 
940   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
941 
942   // Make sure that the caller and callee preserve all of the same registers.
943   auto TRI = ST.getRegisterInfo();
944 
945   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
946   const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
947   if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
948     return false;
949 
950   // Check if the caller and callee will handle arguments in the same way.
951   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
952   CCAssignFn *CalleeAssignFnFixed;
953   CCAssignFn *CalleeAssignFnVarArg;
954   std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
955       getAssignFnsForCC(CalleeCC, TLI);
956 
957   CCAssignFn *CallerAssignFnFixed;
958   CCAssignFn *CallerAssignFnVarArg;
959   std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
960       getAssignFnsForCC(CallerCC, TLI);
961 
962   // FIXME: We are not accounting for potential differences in implicitly passed
963   // inputs, but only the fixed ABI is supported now anyway.
964   IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
965                                        CalleeAssignFnVarArg);
966   IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
967                                        CallerAssignFnVarArg);
968   return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
969 }
970 
971 bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
972     CallLoweringInfo &Info, MachineFunction &MF,
973     SmallVectorImpl<ArgInfo> &OutArgs) const {
974   // If there are no outgoing arguments, then we are done.
975   if (OutArgs.empty())
976     return true;
977 
978   const Function &CallerF = MF.getFunction();
979   CallingConv::ID CalleeCC = Info.CallConv;
980   CallingConv::ID CallerCC = CallerF.getCallingConv();
981   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
982 
983   CCAssignFn *AssignFnFixed;
984   CCAssignFn *AssignFnVarArg;
985   std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
986 
987   // We have outgoing arguments. Make sure that we can tail call with them.
988   SmallVector<CCValAssign, 16> OutLocs;
989   CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
990   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
991 
992   if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
993     LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
994     return false;
995   }
996 
997   // Make sure that they can fit on the caller's stack.
998   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
999   if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
1000     LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1001     return false;
1002   }
1003 
1004   // Verify that the parameters in callee-saved registers match.
1005   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1006   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1007   const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1008   MachineRegisterInfo &MRI = MF.getRegInfo();
1009   return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
1010 }
1011 
1012 /// Return true if the calling convention is one that we can guarantee TCO for.
1013 static bool canGuaranteeTCO(CallingConv::ID CC) {
1014   return CC == CallingConv::Fast;
1015 }
1016 
1017 /// Return true if we might ever do TCO for calls with this calling convention.
1018 static bool mayTailCallThisCC(CallingConv::ID CC) {
1019   switch (CC) {
1020   case CallingConv::C:
1021   case CallingConv::AMDGPU_Gfx:
1022     return true;
1023   default:
1024     return canGuaranteeTCO(CC);
1025   }
1026 }
1027 
1028 bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
1029     MachineIRBuilder &B, CallLoweringInfo &Info,
1030     SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1031   // Must pass all target-independent checks in order to tail call optimize.
1032   if (!Info.IsTailCall)
1033     return false;
1034 
1035   MachineFunction &MF = B.getMF();
1036   const Function &CallerF = MF.getFunction();
1037   CallingConv::ID CalleeCC = Info.CallConv;
1038   CallingConv::ID CallerCC = CallerF.getCallingConv();
1039 
1040   const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1041   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1042   // Kernels aren't callable, and don't have a live-in return address, so it
1043   // doesn't make sense to do a tail call with entry functions.
1044   if (!CallerPreserved)
1045     return false;
1046 
1047   if (!mayTailCallThisCC(CalleeCC)) {
1048     LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1049     return false;
1050   }
1051 
1052   if (any_of(CallerF.args(), [](const Argument &A) {
1053         return A.hasByValAttr() || A.hasSwiftErrorAttr();
1054       })) {
1055     LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1056                          "or swifterror arguments\n");
1057     return false;
1058   }
1059 
1060   // If we have -tailcallopt, then we're done.
1061   if (MF.getTarget().Options.GuaranteedTailCallOpt)
1062     return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
1063 
1064   // Verify that the incoming and outgoing arguments from the callee are
1065   // safe to tail call.
1066   if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1067     LLVM_DEBUG(
1068         dbgs()
1069         << "... Caller and callee have incompatible calling conventions.\n");
1070     return false;
1071   }
1072 
1073   if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1074     return false;
1075 
1076   LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1077   return true;
1078 }
1079 
1080 // Insert outgoing implicit arguments for a call, by inserting copies to the
1081 // implicit argument registers and adding the necessary implicit uses to the
1082 // call instruction.
1083 void AMDGPUCallLowering::handleImplicitCallArguments(
1084     MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
1085     const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1086     ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1087   if (!ST.enableFlatScratch()) {
1088     // Insert copies for the SRD. In the HSA case, this should be an identity
1089     // copy.
1090     auto ScratchRSrcReg =
1091         MIRBuilder.buildCopy(LLT::vector(4, 32), FuncInfo.getScratchRSrcReg());
1092     MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
1093     CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
1094   }
1095 
1096   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1097     MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
1098     CallInst.addReg(ArgReg.first, RegState::Implicit);
1099   }
1100 }
1101 
1102 bool AMDGPUCallLowering::lowerTailCall(
1103     MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1104     SmallVectorImpl<ArgInfo> &OutArgs) const {
1105   MachineFunction &MF = MIRBuilder.getMF();
1106   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1107   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1108   const Function &F = MF.getFunction();
1109   MachineRegisterInfo &MRI = MF.getRegInfo();
1110   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1111 
1112   // True when we're tail calling, but without -tailcallopt.
1113   bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1114 
1115   // Find out which ABI gets to decide where things go.
1116   CallingConv::ID CalleeCC = Info.CallConv;
1117   CCAssignFn *AssignFnFixed;
1118   CCAssignFn *AssignFnVarArg;
1119   std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
1120 
1121   MachineInstrBuilder CallSeqStart;
1122   if (!IsSibCall)
1123     CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
1124 
1125   unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
1126   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1127   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1128     return false;
1129 
1130   // Byte offset for the tail call. When we are sibcalling, this will always
1131   // be 0.
1132   MIB.addImm(0);
1133 
1134   // Tell the call which registers are clobbered.
1135   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1136   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1137   MIB.addRegMask(Mask);
1138 
1139   // FPDiff is the byte offset of the call's argument area from the callee's.
1140   // Stores to callee stack arguments will be placed in FixedStackSlots offset
1141   // by this amount for a tail call. In a sibling call it must be 0 because the
1142   // caller will deallocate the entire stack and the callee still expects its
1143   // arguments to begin at SP+0.
1144   int FPDiff = 0;
1145 
1146   // This will be 0 for sibcalls, potentially nonzero for tail calls produced
1147   // by -tailcallopt. For sibcalls, the memory operands for the call are
1148   // already available in the caller's incoming argument space.
1149   unsigned NumBytes = 0;
1150   if (!IsSibCall) {
1151     // We aren't sibcalling, so we need to compute FPDiff. We need to do this
1152     // before handling assignments, because FPDiff must be known for memory
1153     // arguments.
1154     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1155     SmallVector<CCValAssign, 16> OutLocs;
1156     CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1157 
1158     // FIXME: Not accounting for callee implicit inputs
1159     OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1160     if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
1161       return false;
1162 
1163     // The callee will pop the argument stack as a tail call. Thus, we must
1164     // keep it 16-byte aligned.
1165     NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment());
1166 
1167     // FPDiff will be negative if this tail call requires more space than we
1168     // would automatically have in our incoming argument space. Positive if we
1169     // actually shrink the stack.
1170     FPDiff = NumReusableBytes - NumBytes;
1171 
1172     // The stack pointer must be 16-byte aligned at all times it's used for a
1173     // memory operation, which in practice means at *all* times and in
1174     // particular across call boundaries. Therefore our own arguments started at
1175     // a 16-byte aligned SP and the delta applied for the tail call should
1176     // satisfy the same constraint.
1177     assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1178            "unaligned stack on tail call");
1179   }
1180 
1181   SmallVector<CCValAssign, 16> ArgLocs;
1182   CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1183 
1184   // We could pass MIB and directly add the implicit uses to the call
1185   // now. However, as an aesthetic choice, place implicit argument operands
1186   // after the ordinary user argument registers.
1187   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1188 
1189   if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
1190       Info.CallConv != CallingConv::AMDGPU_Gfx) {
1191     // With a fixed ABI, allocate fixed registers before user arguments.
1192     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1193       return false;
1194   }
1195 
1196   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1197 
1198   if (!determineAssignments(Assigner, OutArgs, CCInfo))
1199     return false;
1200 
1201   // Do the actual argument marshalling.
1202   AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1203   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1204     return false;
1205 
1206   handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
1207 
1208   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
1209   // sequence start and end here.
1210   if (!IsSibCall) {
1211     MIB->getOperand(1).setImm(FPDiff);
1212     CallSeqStart.addImm(NumBytes).addImm(0);
1213     // End the call sequence *before* emitting the call. Normally, we would
1214     // tidy the frame up after the call. However, here, we've laid out the
1215     // parameters so that when SP is reset, they will be in the correct
1216     // location.
1217     MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
1218   }
1219 
1220   // Now we can add the actual call instruction to the correct basic block.
1221   MIRBuilder.insertInstr(MIB);
1222 
1223   // If Callee is a reg, since it is used by a target specific
1224   // instruction, it must have a register class matching the
1225   // constraint of that instruction.
1226 
1227   // FIXME: We should define regbankselectable call instructions to handle
1228   // divergent call targets.
1229   if (MIB->getOperand(0).isReg()) {
1230     MIB->getOperand(0).setReg(constrainOperandRegClass(
1231         MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
1232         MIB->getDesc(), MIB->getOperand(0), 0));
1233   }
1234 
1235   MF.getFrameInfo().setHasTailCall();
1236   Info.LoweredTailCall = true;
1237   return true;
1238 }
1239 
1240 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1241                                    CallLoweringInfo &Info) const {
1242   if (Info.IsVarArg) {
1243     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1244     return false;
1245   }
1246 
1247   MachineFunction &MF = MIRBuilder.getMF();
1248   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1249   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1250 
1251   const Function &F = MF.getFunction();
1252   MachineRegisterInfo &MRI = MF.getRegInfo();
1253   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1254   const DataLayout &DL = F.getParent()->getDataLayout();
1255   CallingConv::ID CallConv = F.getCallingConv();
1256 
1257   if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
1258       CallConv != CallingConv::AMDGPU_Gfx) {
1259     LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
1260     return false;
1261   }
1262 
1263   if (AMDGPU::isShader(CallConv)) {
1264     LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
1265     return false;
1266   }
1267 
1268   SmallVector<ArgInfo, 8> OutArgs;
1269   for (auto &OrigArg : Info.OrigArgs)
1270     splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
1271 
1272   SmallVector<ArgInfo, 8> InArgs;
1273   if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1274     splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
1275 
1276   // If we can lower as a tail call, do that instead.
1277   bool CanTailCallOpt =
1278       isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
1279 
1280   // We must emit a tail call if we have musttail.
1281   if (Info.IsMustTailCall && !CanTailCallOpt) {
1282     LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1283     return false;
1284   }
1285 
1286   if (CanTailCallOpt)
1287     return lowerTailCall(MIRBuilder, Info, OutArgs);
1288 
1289   // Find out which ABI gets to decide where things go.
1290   CCAssignFn *AssignFnFixed;
1291   CCAssignFn *AssignFnVarArg;
1292   std::tie(AssignFnFixed, AssignFnVarArg) =
1293       getAssignFnsForCC(Info.CallConv, TLI);
1294 
1295   MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1296     .addImm(0)
1297     .addImm(0);
1298 
1299   // Create a temporarily-floating call instruction so we can add the implicit
1300   // uses of arg registers.
1301   unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
1302 
1303   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1304   MIB.addDef(TRI->getReturnAddressReg(MF));
1305 
1306   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1307     return false;
1308 
1309   // Tell the call which registers are clobbered.
1310   const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1311   MIB.addRegMask(Mask);
1312 
1313   SmallVector<CCValAssign, 16> ArgLocs;
1314   CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1315 
1316   // We could pass MIB and directly add the implicit uses to the call
1317   // now. However, as an aesthetic choice, place implicit argument operands
1318   // after the ordinary user argument registers.
1319   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1320 
1321   if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
1322       Info.CallConv != CallingConv::AMDGPU_Gfx) {
1323     // With a fixed ABI, allocate fixed registers before user arguments.
1324     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1325       return false;
1326   }
1327 
1328   // Do the actual argument marshalling.
1329   SmallVector<Register, 8> PhysRegs;
1330 
1331   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1332   if (!determineAssignments(Assigner, OutArgs, CCInfo))
1333     return false;
1334 
1335   AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1336   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1337     return false;
1338 
1339   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1340 
1341   handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
1342 
1343   // Get a count of how many bytes are to be pushed on the stack.
1344   unsigned NumBytes = CCInfo.getNextStackOffset();
1345 
1346   // If Callee is a reg, since it is used by a target specific
1347   // instruction, it must have a register class matching the
1348   // constraint of that instruction.
1349 
1350   // FIXME: We should define regbankselectable call instructions to handle
1351   // divergent call targets.
1352   if (MIB->getOperand(1).isReg()) {
1353     MIB->getOperand(1).setReg(constrainOperandRegClass(
1354         MF, *TRI, MRI, *ST.getInstrInfo(),
1355         *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1356         1));
1357   }
1358 
1359   // Now we can add the actual call instruction to the correct position.
1360   MIRBuilder.insertInstr(MIB);
1361 
1362   // Finally we can copy the returned value back into its virtual-register. In
1363   // symmetry with the arguments, the physical register must be an
1364   // implicit-define of the call instruction.
1365   if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1366     CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1367                                                       Info.IsVarArg);
1368     OutgoingValueAssigner Assigner(RetAssignFn);
1369     CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1370     if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
1371                                        Info.CallConv, Info.IsVarArg))
1372       return false;
1373   }
1374 
1375   uint64_t CalleePopBytes = NumBytes;
1376 
1377   MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1378             .addImm(0)
1379             .addImm(CalleePopBytes);
1380 
1381   if (!Info.CanLowerReturn) {
1382     insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
1383                     Info.DemoteRegister, Info.DemoteStackIndex);
1384   }
1385 
1386   return true;
1387 }
1388