1 //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the lowering of LLVM calls to machine code calls for
11 /// GlobalISel.
12 ///
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/CodeGen/FunctionLoweringInfo.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25
26 #define DEBUG_TYPE "amdgpu-call-lowering"
27
28 using namespace llvm;
29
30 namespace {
31
32 /// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
33 static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
34 Register ValVReg, CCValAssign &VA) {
35 if (VA.getLocVT().getSizeInBits() < 32) {
36 // 16-bit types are reported as legal for 32-bit registers. We need to
37 // extend and do a 32-bit copy to avoid the verifier complaining about it.
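// Illustrative example (not verbatim MIR): an outgoing s16 value becomes
//   %ext:_(s32) = G_ANYEXT %val:_(s16)
// before it is copied into its 32-bit location.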
38 return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
39 }
40
41 return Handler.extendRegister(ValVReg, VA);
42 }
43
44 struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
45 AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
46 MachineInstrBuilder MIB)
47 : OutgoingValueHandler(B, MRI), MIB(MIB) {}
48
49 MachineInstrBuilder MIB;
50
51 Register getStackAddress(uint64_t Size, int64_t Offset,
52 MachinePointerInfo &MPO,
53 ISD::ArgFlagsTy Flags) override {
54 llvm_unreachable("not implemented");
55 }
56
57 void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
58 MachinePointerInfo &MPO, CCValAssign &VA) override {
59 llvm_unreachable("not implemented");
60 }
61
62 void assignValueToReg(Register ValVReg, Register PhysReg,
63 CCValAssign &VA) override {
64 Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
65
66 // If this is a scalar return, insert a readfirstlane just in case the value
67 // ends up in a VGPR.
68 // FIXME: Assert this is a shader return.
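// Illustrative MIR (roughly), for a value that must land in an SGPR:
//   %lane:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %ext:_(s32)
//   $sgprN = COPY %lane:_(s32)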
69 const SIRegisterInfo *TRI
70 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
71 if (TRI->isSGPRReg(MRI, PhysReg)) {
72 auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
73 {MRI.getType(ExtReg)}, false)
74 .addReg(ExtReg);
75 ExtReg = ToSGPR.getReg(0);
76 }
77
78 MIRBuilder.buildCopy(PhysReg, ExtReg);
79 MIB.addUse(PhysReg, RegState::Implicit);
80 }
81 };
82
83 struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
84 uint64_t StackUsed = 0;
85
86 AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
87 : IncomingValueHandler(B, MRI) {}
88
89 Register getStackAddress(uint64_t Size, int64_t Offset,
90 MachinePointerInfo &MPO,
91 ISD::ArgFlagsTy Flags) override {
92 auto &MFI = MIRBuilder.getMF().getFrameInfo();
93
94 // Byval is assumed to be writable memory, but other stack passed arguments
95 // are not.
96 const bool IsImmutable = !Flags.isByVal();
97 int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
98 MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
99 auto AddrReg = MIRBuilder.buildFrameIndex(
100 LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
101 StackUsed = std::max(StackUsed, Size + Offset);
102 return AddrReg.getReg(0);
103 }
104
105 void assignValueToReg(Register ValVReg, Register PhysReg,
106 CCValAssign &VA) override {
107 markPhysRegUsed(PhysReg);
108
109 if (VA.getLocVT().getSizeInBits() < 32) {
110 // 16-bit types are reported as legal for 32-bit registers. We need to do
111 // a 32-bit copy, and truncate to avoid the verifier complaining about it.
112 auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
113
114 // If we have signext/zeroext, it applies to the whole 32-bit register
115 // before truncation.
116 auto Extended =
117 buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
118 MIRBuilder.buildTrunc(ValVReg, Extended);
119 return;
120 }
121
122 IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
123 }
124
125 void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
126 MachinePointerInfo &MPO, CCValAssign &VA) override {
127 MachineFunction &MF = MIRBuilder.getMF();
128
129 // The reported memory location may be wider than the value.
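// e.g. (illustrative) a value held in an s16 vreg whose reported stack slot is
// 4 bytes wide is loaded as only 2 bytes here.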
130 const LLT RegTy = MRI.getType(ValVReg);
131 MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
132
133 // FIXME: Get alignment
134 auto MMO = MF.getMachineMemOperand(
135 MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
136 inferAlignFromPtrInfo(MF, MPO));
137 MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
138 }
139
140 /// How the physical register gets marked varies between formal
141 /// parameters (it's a basic-block live-in) and a call return value
142 /// (it's an implicit-def of the call instruction).
143 virtual void markPhysRegUsed(unsigned PhysReg) = 0;
144 };
145
146 struct FormalArgHandler : public AMDGPUIncomingArgHandler {
147 FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
148 : AMDGPUIncomingArgHandler(B, MRI) {}
149
150 void markPhysRegUsed(unsigned PhysReg) override {
151 MIRBuilder.getMBB().addLiveIn(PhysReg);
152 }
153 };
154
155 struct CallReturnHandler : public AMDGPUIncomingArgHandler {
156 CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
157 MachineInstrBuilder MIB)
158 : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
159
160 void markPhysRegUsed(unsigned PhysReg) override {
161 MIB.addDef(PhysReg, RegState::Implicit);
162 }
163
164 MachineInstrBuilder MIB;
165 };
166
167 struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
168 /// For tail calls, the byte offset of the call's argument area from the
169 /// callee's. Unused elsewhere.
170 int FPDiff;
171
172 // Cache the SP register vreg if we need it more than once in this call site.
173 Register SPReg;
174
175 bool IsTailCall;
176
177 AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
178 MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
179 bool IsTailCall = false, int FPDiff = 0)
180 : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
181 IsTailCall(IsTailCall) {}
182
183 Register getStackAddress(uint64_t Size, int64_t Offset,
184 MachinePointerInfo &MPO,
185 ISD::ArgFlagsTy Flags) override {
186 MachineFunction &MF = MIRBuilder.getMF();
187 const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
188 const LLT S32 = LLT::scalar(32);
189
190 if (IsTailCall) {
191 Offset += FPDiff;
192 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
193 auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
194 MPO = MachinePointerInfo::getFixedStack(MF, FI);
195 return FIReg.getReg(0);
196 }
197
198 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
199
200 if (!SPReg)
201 SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
202
203 auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
204
205 auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
206 MPO = MachinePointerInfo::getStack(MF, Offset);
207 return AddrReg.getReg(0);
208 }
209
210 void assignValueToReg(Register ValVReg, Register PhysReg,
211 CCValAssign &VA) override {
212 MIB.addUse(PhysReg, RegState::Implicit);
213 Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
214 MIRBuilder.buildCopy(PhysReg, ExtReg);
215 }
216
217 void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
218 MachinePointerInfo &MPO, CCValAssign &VA) override {
219 MachineFunction &MF = MIRBuilder.getMF();
220 uint64_t LocMemOffset = VA.getLocMemOffset();
221 const auto &ST = MF.getSubtarget<GCNSubtarget>();
222
223 auto MMO = MF.getMachineMemOperand(
224 MPO, MachineMemOperand::MOStore, Size,
225 commonAlignment(ST.getStackAlignment(), LocMemOffset));
226 MIRBuilder.buildStore(ValVReg, Addr, *MMO);
227 }
228
229 void assignValueToAddress(const CallLowering::ArgInfo &Arg,
230 unsigned ValRegIndex, Register Addr,
231 uint64_t MemSize, MachinePointerInfo &MPO,
232 CCValAssign &VA) override {
233 Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
234 ? extendRegister(Arg.Regs[ValRegIndex], VA)
235 : Arg.Regs[ValRegIndex];
236
237 // If we extended the value type we might need to adjust the MMO's
238 // Size. This happens if ComputeValueVTs widened a small type value to a
239 // legal register type (e.g. s8->s16)
240 const LLT RegTy = MRI.getType(ValVReg);
241 MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
242 assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
243 }
244 };
245 }
246
247 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
248 : CallLowering(&TLI) {
249 }
250
251 // FIXME: Compatibility shim
252 static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
253 switch (MIOpc) {
254 case TargetOpcode::G_SEXT:
255 return ISD::SIGN_EXTEND;
256 case TargetOpcode::G_ZEXT:
257 return ISD::ZERO_EXTEND;
258 case TargetOpcode::G_ANYEXT:
259 return ISD::ANY_EXTEND;
260 default:
261 llvm_unreachable("not an extend opcode");
262 }
263 }
264
265 bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
266 CallingConv::ID CallConv,
267 SmallVectorImpl<BaseArgInfo> &Outs,
268 bool IsVarArg) const {
269 // For entry functions (shaders and kernels). Vector types should be handled explicitly by the CC.
270 if (AMDGPU::isEntryFunctionCC(CallConv))
271 return true;
272
273 SmallVector<CCValAssign, 16> ArgLocs;
274 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
275 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
276 MF.getFunction().getContext());
277
278 return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
279 }
280
281 /// Lower the return value for the already existing \p Ret. This assumes that
282 /// \p B's insertion point is correct.
283 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
284 const Value *Val, ArrayRef<Register> VRegs,
285 MachineInstrBuilder &Ret) const {
286 if (!Val)
287 return true;
288
289 auto &MF = B.getMF();
290 const auto &F = MF.getFunction();
291 const DataLayout &DL = MF.getDataLayout();
292 MachineRegisterInfo *MRI = B.getMRI();
293 LLVMContext &Ctx = F.getContext();
294
295 CallingConv::ID CC = F.getCallingConv();
296 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
297
298 SmallVector<EVT, 8> SplitEVTs;
299 ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
300 assert(VRegs.size() == SplitEVTs.size() &&
301 "For each split Type there should be exactly one VReg.");
302
303 SmallVector<ArgInfo, 8> SplitRetInfos;
304
305 for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
306 EVT VT = SplitEVTs[i];
307 Register Reg = VRegs[i];
308 ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
309 setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
310
311 if (VT.isScalarInteger()) {
312 unsigned ExtendOp = TargetOpcode::G_ANYEXT;
313 if (RetInfo.Flags[0].isSExt()) {
314 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
315 ExtendOp = TargetOpcode::G_SEXT;
316 } else if (RetInfo.Flags[0].isZExt()) {
317 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
318 ExtendOp = TargetOpcode::G_ZEXT;
319 }
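// e.g. (illustrative) a 'signext i16' return value selects G_SEXT here; if
// getTypeForExtReturn reports a wider type (such as i32), the value is
// extended below before being assigned.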
320
321 EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
322 extOpcodeToISDExtOpcode(ExtendOp));
323 if (ExtVT != VT) {
324 RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
325 LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
326 Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
327 }
328 }
329
330 if (Reg != RetInfo.Regs[0]) {
331 RetInfo.Regs[0] = Reg;
332 // Reset the arg flags after modifying Reg.
333 setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
334 }
335
336 splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
337 }
338
339 CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
340
341 OutgoingValueAssigner Assigner(AssignFn);
342 AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
343 return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
344 CC, F.isVarArg());
345 }
346
347 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
348 ArrayRef<Register> VRegs,
349 FunctionLoweringInfo &FLI) const {
350
351 MachineFunction &MF = B.getMF();
352 MachineRegisterInfo &MRI = MF.getRegInfo();
353 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
354 MFI->setIfReturnsVoid(!Val);
355
356 assert(!Val == VRegs.empty() && "Return value without a vreg");
357
358 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
359 const bool IsShader = AMDGPU::isShader(CC);
360 const bool IsWaveEnd =
361 (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
362 if (IsWaveEnd) {
363 B.buildInstr(AMDGPU::S_ENDPGM)
364 .addImm(0);
365 return true;
366 }
367
368 auto const &ST = MF.getSubtarget<GCNSubtarget>();
369
370 unsigned ReturnOpc =
371 IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
372
373 auto Ret = B.buildInstrNoInsert(ReturnOpc);
374 Register ReturnAddrVReg;
375 if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
376 ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
377 Ret.addUse(ReturnAddrVReg);
378 }
379
380 if (!FLI.CanLowerReturn)
381 insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
382 else if (!lowerReturnVal(B, Val, VRegs, Ret))
383 return false;
384
385 if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
386 const SIRegisterInfo *TRI = ST.getRegisterInfo();
387 Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
388 &AMDGPU::SGPR_64RegClass);
389 B.buildCopy(ReturnAddrVReg, LiveInReturn);
390 }
391
392 // TODO: Handle CalleeSavedRegsViaCopy.
393
394 B.insertInstr(Ret);
395 return true;
396 }
397
398 void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
399 Type *ParamTy,
400 uint64_t Offset) const {
401 MachineFunction &MF = B.getMF();
402 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
403 MachineRegisterInfo &MRI = MF.getRegInfo();
404 Register KernArgSegmentPtr =
405 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
406 Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
407
408 auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
409
410 B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
411 }
412
413 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
414 uint64_t Offset, Align Alignment,
415 Register DstReg) const {
416 MachineFunction &MF = B.getMF();
417 const Function &F = MF.getFunction();
418 const DataLayout &DL = F.getParent()->getDataLayout();
419 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
420 unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
421
422 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
423 Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
424 lowerParameterPtr(PtrReg, B, ParamTy, Offset);
425
426 MachineMemOperand *MMO = MF.getMachineMemOperand(
427 PtrInfo,
428 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
429 MachineMemOperand::MOInvariant,
430 TypeSize, Alignment);
431
432 B.buildLoad(DstReg, PtrReg, *MMO);
433 }
434
435 // Allocate special inputs passed in user SGPRs.
436 static void allocateHSAUserSGPRs(CCState &CCInfo,
437 MachineIRBuilder &B,
438 MachineFunction &MF,
439 const SIRegisterInfo &TRI,
440 SIMachineFunctionInfo &Info) {
441 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
442 if (Info.hasPrivateSegmentBuffer()) {
443 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
444 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
445 CCInfo.AllocateReg(PrivateSegmentBufferReg);
446 }
447
448 if (Info.hasDispatchPtr()) {
449 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
450 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
451 CCInfo.AllocateReg(DispatchPtrReg);
452 }
453
454 if (Info.hasQueuePtr()) {
455 Register QueuePtrReg = Info.addQueuePtr(TRI);
456 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
457 CCInfo.AllocateReg(QueuePtrReg);
458 }
459
460 if (Info.hasKernargSegmentPtr()) {
461 MachineRegisterInfo &MRI = MF.getRegInfo();
462 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
463 const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
464 Register VReg = MRI.createGenericVirtualRegister(P4);
465 MRI.addLiveIn(InputPtrReg, VReg);
466 B.getMBB().addLiveIn(InputPtrReg);
467 B.buildCopy(VReg, InputPtrReg);
468 CCInfo.AllocateReg(InputPtrReg);
469 }
470
471 if (Info.hasDispatchID()) {
472 Register DispatchIDReg = Info.addDispatchID(TRI);
473 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
474 CCInfo.AllocateReg(DispatchIDReg);
475 }
476
477 if (Info.hasFlatScratchInit()) {
478 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
479 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
480 CCInfo.AllocateReg(FlatScratchInitReg);
481 }
482
483 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
484 // these from the dispatch pointer.
485 }
486
487 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
488 MachineIRBuilder &B, const Function &F,
489 ArrayRef<ArrayRef<Register>> VRegs) const {
490 MachineFunction &MF = B.getMF();
491 const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
492 MachineRegisterInfo &MRI = MF.getRegInfo();
493 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
494 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
495 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
496 const DataLayout &DL = F.getParent()->getDataLayout();
497
498 Info->allocateModuleLDSGlobal(F.getParent());
499
500 SmallVector<CCValAssign, 16> ArgLocs;
501 CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
502
503 allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
504
505 unsigned i = 0;
506 const Align KernArgBaseAlign(16);
507 const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
508 uint64_t ExplicitArgOffset = 0;
509
510 // TODO: Align down to dword alignment and extract bits for extending loads.
511 for (auto &Arg : F.args()) {
512 const bool IsByRef = Arg.hasByRefAttr();
513 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
514 unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
515 if (AllocSize == 0)
516 continue;
517
518 MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
519 if (!ABIAlign)
520 ABIAlign = DL.getABITypeAlign(ArgTy);
521
522 uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
523 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
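// Illustrative walk-through (assuming BaseOffset = 0) for kernel args (i32, i64):
// the i32 lands at offset 0 and ExplicitArgOffset becomes 4; the i64 is then
// realigned to 8, lands at offset 8, and ExplicitArgOffset ends at 16.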
524
525 if (Arg.use_empty()) {
526 ++i;
527 continue;
528 }
529
530 Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
531
532 if (IsByRef) {
533 unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
534
535 assert(VRegs[i].size() == 1 &&
536 "expected only one register for byval pointers");
537 if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
538 lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
539 } else {
540 const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
541 Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
542 lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
543
544 B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
545 }
546 } else {
547 ArrayRef<Register> OrigArgRegs = VRegs[i];
548 Register ArgReg =
549 OrigArgRegs.size() == 1
550 ? OrigArgRegs[0]
551 : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
552
553 lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
554 if (OrigArgRegs.size() > 1)
555 unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
556 }
557
558 ++i;
559 }
560
561 TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
562 TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
563 return true;
564 }
565
566 bool AMDGPUCallLowering::lowerFormalArguments(
567 MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
568 FunctionLoweringInfo &FLI) const {
569 CallingConv::ID CC = F.getCallingConv();
570
571 // The infrastructure for normal calling convention lowering is essentially
572 // useless for kernels. We want to avoid any kind of legalization or argument
573 // splitting.
574 if (CC == CallingConv::AMDGPU_KERNEL)
575 return lowerFormalArgumentsKernel(B, F, VRegs);
576
577 const bool IsGraphics = AMDGPU::isGraphics(CC);
578 const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
579
580 MachineFunction &MF = B.getMF();
581 MachineBasicBlock &MBB = B.getMBB();
582 MachineRegisterInfo &MRI = MF.getRegInfo();
583 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
584 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
585 const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
586 const DataLayout &DL = F.getParent()->getDataLayout();
587
588 Info->allocateModuleLDSGlobal(F.getParent());
589
590 SmallVector<CCValAssign, 16> ArgLocs;
591 CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
592
593 if (!IsEntryFunc) {
594 Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
595 Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
596 &AMDGPU::SGPR_64RegClass);
597 MBB.addLiveIn(ReturnAddrReg);
598 B.buildCopy(LiveInReturn, ReturnAddrReg);
599 }
600
601 if (Info->hasImplicitBufferPtr()) {
602 Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
603 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
604 CCInfo.AllocateReg(ImplicitBufferPtrReg);
605 }
606
607 SmallVector<ArgInfo, 32> SplitArgs;
608 unsigned Idx = 0;
609 unsigned PSInputNum = 0;
610
611 // Insert the hidden sret parameter if the return value won't fit in the
612 // return registers.
613 if (!FLI.CanLowerReturn)
614 insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
615
616 for (auto &Arg : F.args()) {
617 if (DL.getTypeStoreSize(Arg.getType()) == 0)
618 continue;
619
620 const bool InReg = Arg.hasAttribute(Attribute::InReg);
621
622 // SGPR arguments to functions not implemented.
623 if (!IsGraphics && InReg)
624 return false;
625
626 if (Arg.hasAttribute(Attribute::SwiftSelf) ||
627 Arg.hasAttribute(Attribute::SwiftError) ||
628 Arg.hasAttribute(Attribute::Nest))
629 return false;
630
631 if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
632 const bool ArgUsed = !Arg.use_empty();
633 bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
634
635 if (!SkipArg) {
636 Info->markPSInputAllocated(PSInputNum);
637 if (ArgUsed)
638 Info->markPSInputEnabled(PSInputNum);
639 }
640
641 ++PSInputNum;
642
643 if (SkipArg) {
644 for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
645 B.buildUndef(VRegs[Idx][I]);
646
647 ++Idx;
648 continue;
649 }
650 }
651
652 ArgInfo OrigArg(VRegs[Idx], Arg);
653 const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
654 setArgFlags(OrigArg, OrigArgIdx, DL, F);
655
656 splitToValueTypes(OrigArg, SplitArgs, DL, CC);
657 ++Idx;
658 }
659
660 // At least one interpolation mode must be enabled or else the GPU will
661 // hang.
662 //
663 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
664 // set PSInputAddr, the user wants to enable some bits after compilation
665 // based on run-time states. Since we can't know what the final PSInputEna
666 // will look like, we shouldn't do anything here; the user should take
667 // responsibility for the correct programming.
668 //
669 // Otherwise, the following restrictions apply:
670 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
671 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
672 // enabled too.
673 if (CC == CallingConv::AMDGPU_PS) {
674 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
675 ((Info->getPSInputAddr() & 0xF) == 0 &&
676 Info->isPSInputAllocated(11))) {
677 CCInfo.AllocateReg(AMDGPU::VGPR0);
678 CCInfo.AllocateReg(AMDGPU::VGPR1);
679 Info->markPSInputAllocated(0);
680 Info->markPSInputEnabled(0);
681 }
682
683 if (Subtarget.isAmdPalOS()) {
684 // For isAmdPalOS, the user does not enable some bits after compilation
685 // based on run-time states; the register values being generated here are
686 // the final ones set in hardware. Therefore we need to apply the
687 // workaround to PSInputAddr and PSInputEnable together. (The case where
688 // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
689 // set up an input arg for a particular interpolation mode, but nothing
690 // uses that input arg. Really we should have an earlier pass that removes
691 // such an arg.)
692 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
693 if ((PsInputBits & 0x7F) == 0 ||
694 ((PsInputBits & 0xF) == 0 &&
695 (PsInputBits >> 11 & 1)))
696 Info->markPSInputEnabled(
697 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
698 }
699 }
700
701 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
702 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
703
704 if (!MBB.empty())
705 B.setInstr(*MBB.begin());
706
707 if (!IsEntryFunc) {
708 // For the fixed ABI, pass workitem IDs in the last argument register.
709 if (AMDGPUTargetMachine::EnableFixedFunctionABI)
710 TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
711 }
712
713 IncomingValueAssigner Assigner(AssignFn);
714 if (!determineAssignments(Assigner, SplitArgs, CCInfo))
715 return false;
716
717 FormalArgHandler Handler(B, MRI);
718 if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
719 return false;
720
721 uint64_t StackOffset = Assigner.StackOffset;
722
723 if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
724 // Special inputs come after user arguments.
725 TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
726 }
727
728 // Start adding system SGPRs.
729 if (IsEntryFunc) {
730 TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
731 } else {
732 if (!Subtarget.enableFlatScratch())
733 CCInfo.AllocateReg(Info->getScratchRSrcReg());
734 TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
735 }
736
737 // When we tail call, we need to check if the callee's arguments will fit on
738 // the caller's stack. So, whenever we lower formal arguments, we should keep
739 // track of this information, since we might lower a tail call in this
740 // function later.
741 Info->setBytesInStackArgArea(StackOffset);
742
743 // Move back to the end of the basic block.
744 B.setMBB(MBB);
745
746 return true;
747 }
748
749 bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
750 CCState &CCInfo,
751 SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
752 CallLoweringInfo &Info) const {
753 MachineFunction &MF = MIRBuilder.getMF();
754
755 const AMDGPUFunctionArgInfo *CalleeArgInfo
756 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
757
758 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
759 const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
760
761
762 // TODO: Unify with private memory register handling. This is complicated by
763 // the fact that at least in kernels, the input argument is not necessarily
764 // in the same location as the input.
765 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
766 AMDGPUFunctionArgInfo::DISPATCH_PTR,
767 AMDGPUFunctionArgInfo::QUEUE_PTR,
768 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
769 AMDGPUFunctionArgInfo::DISPATCH_ID,
770 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
771 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
772 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
773 };
774
775 MachineRegisterInfo &MRI = MF.getRegInfo();
776
777 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
778 const AMDGPULegalizerInfo *LI
779 = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
780
781 for (auto InputID : InputRegs) {
782 const ArgDescriptor *OutgoingArg;
783 const TargetRegisterClass *ArgRC;
784 LLT ArgTy;
785
786 std::tie(OutgoingArg, ArgRC, ArgTy) =
787 CalleeArgInfo->getPreloadedValue(InputID);
788 if (!OutgoingArg)
789 continue;
790
791 const ArgDescriptor *IncomingArg;
792 const TargetRegisterClass *IncomingArgRC;
793 std::tie(IncomingArg, IncomingArgRC, ArgTy) =
794 CallerArgInfo.getPreloadedValue(InputID);
795 assert(IncomingArgRC == ArgRC);
796
797 Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
798
799 if (IncomingArg) {
800 LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
801 } else {
802 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
803 LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
804 }
805
806 if (OutgoingArg->isRegister()) {
807 ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
808 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
809 report_fatal_error("failed to allocate implicit input argument");
810 } else {
811 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
812 return false;
813 }
814 }
815
816 // Pack workitem IDs into a single register, or pass them as-is if they are
817 // already packed.
818 const ArgDescriptor *OutgoingArg;
819 const TargetRegisterClass *ArgRC;
820 LLT ArgTy;
821
822 std::tie(OutgoingArg, ArgRC, ArgTy) =
823 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
824 if (!OutgoingArg)
825 std::tie(OutgoingArg, ArgRC, ArgTy) =
826 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
827 if (!OutgoingArg)
828 std::tie(OutgoingArg, ArgRC, ArgTy) =
829 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
830 if (!OutgoingArg)
831 return false;
832
833 auto WorkitemIDX =
834 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
835 auto WorkitemIDY =
836 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
837 auto WorkitemIDZ =
838 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
839
840 const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
841 const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
842 const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
843 const LLT S32 = LLT::scalar(32);
844
845 // If the incoming IDs are not packed, we need to pack them.
846 // FIXME: Should consider known workgroup size to eliminate known 0 cases.
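// Packed layout produced below: bits [9:0] = X, bits [19:10] = Y,
// bits [29:20] = Z, matching the shifts by 10 and 20.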
847 Register InputReg;
848 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
849 InputReg = MRI.createGenericVirtualRegister(S32);
850 LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
851 std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
852 }
853
854 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
855 Register Y = MRI.createGenericVirtualRegister(S32);
856 LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
857 std::get<2>(WorkitemIDY));
858
859 Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
860 InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
861 }
862
863 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
864 Register Z = MRI.createGenericVirtualRegister(S32);
865 LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
866 std::get<2>(WorkitemIDZ));
867
868 Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
869 InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
870 }
871
872 if (!InputReg) {
873 InputReg = MRI.createGenericVirtualRegister(S32);
874
875 // Workitem IDs are already packed; any of the present incoming arguments
876 // will carry all required fields.
877 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
878 IncomingArgX ? *IncomingArgX :
879 IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
880 LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
881 &AMDGPU::VGPR_32RegClass, S32);
882 }
883
884 if (OutgoingArg->isRegister()) {
885 ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
886 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
887 report_fatal_error("failed to allocate implicit input argument");
888 } else {
889 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
890 return false;
891 }
892
893 return true;
894 }
895
896 /// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
897 /// CC.
898 static std::pair<CCAssignFn *, CCAssignFn *>
899 getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
900 return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
901 }
902
903 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
904 bool IsTailCall) {
905 return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
906 }
907
908 // Add operands to call instruction to track the callee.
909 static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
910 MachineIRBuilder &MIRBuilder,
911 AMDGPUCallLowering::CallLoweringInfo &Info) {
912 if (Info.Callee.isReg()) {
913 CallInst.addReg(Info.Callee.getReg());
914 CallInst.addImm(0);
915 } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
916 // The call lowering lightly assumed we can directly encode a call target in
917 // the instruction, which is not the case. Materialize the address here.
918 const GlobalValue *GV = Info.Callee.getGlobal();
919 auto Ptr = MIRBuilder.buildGlobalValue(
920 LLT::pointer(GV->getAddressSpace(), 64), GV);
921 CallInst.addReg(Ptr.getReg(0));
922 CallInst.add(Info.Callee);
923 } else
924 return false;
925
926 return true;
927 }
928
929 bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
930 CallLoweringInfo &Info, MachineFunction &MF,
931 SmallVectorImpl<ArgInfo> &InArgs) const {
932 const Function &CallerF = MF.getFunction();
933 CallingConv::ID CalleeCC = Info.CallConv;
934 CallingConv::ID CallerCC = CallerF.getCallingConv();
935
936 // If the calling conventions match, then everything must be the same.
937 if (CalleeCC == CallerCC)
938 return true;
939
940 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
941
942 // Make sure that the caller and callee preserve all of the same registers.
943 auto TRI = ST.getRegisterInfo();
944
945 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
946 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
947 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
948 return false;
949
950 // Check if the caller and callee will handle arguments in the same way.
951 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
952 CCAssignFn *CalleeAssignFnFixed;
953 CCAssignFn *CalleeAssignFnVarArg;
954 std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
955 getAssignFnsForCC(CalleeCC, TLI);
956
957 CCAssignFn *CallerAssignFnFixed;
958 CCAssignFn *CallerAssignFnVarArg;
959 std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
960 getAssignFnsForCC(CallerCC, TLI);
961
962 // FIXME: We are not accounting for potential differences in implicitly passed
963 // inputs, but only the fixed ABI is supported now anyway.
964 IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
965 CalleeAssignFnVarArg);
966 IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
967 CallerAssignFnVarArg);
968 return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
969 }
970
971 bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
972 CallLoweringInfo &Info, MachineFunction &MF,
973 SmallVectorImpl<ArgInfo> &OutArgs) const {
974 // If there are no outgoing arguments, then we are done.
975 if (OutArgs.empty())
976 return true;
977
978 const Function &CallerF = MF.getFunction();
979 CallingConv::ID CalleeCC = Info.CallConv;
980 CallingConv::ID CallerCC = CallerF.getCallingConv();
981 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
982
983 CCAssignFn *AssignFnFixed;
984 CCAssignFn *AssignFnVarArg;
985 std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
986
987 // We have outgoing arguments. Make sure that we can tail call with them.
988 SmallVector<CCValAssign, 16> OutLocs;
989 CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
990 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
991
992 if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
993 LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
994 return false;
995 }
996
997 // Make sure that they can fit on the caller's stack.
998 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
999 if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
1000 LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1001 return false;
1002 }
1003
1004 // Verify that the parameters in callee-saved registers match.
1005 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1006 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1007 const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1008 MachineRegisterInfo &MRI = MF.getRegInfo();
1009 return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
1010 }
1011
1012 /// Return true if the calling convention is one that we can guarantee TCO for.
1013 static bool canGuaranteeTCO(CallingConv::ID CC) {
1014 return CC == CallingConv::Fast;
1015 }
1016
1017 /// Return true if we might ever do TCO for calls with this calling convention.
1018 static bool mayTailCallThisCC(CallingConv::ID CC) {
1019 switch (CC) {
1020 case CallingConv::C:
1021 case CallingConv::AMDGPU_Gfx:
1022 return true;
1023 default:
1024 return canGuaranteeTCO(CC);
1025 }
1026 }
1027
1028 bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
1029 MachineIRBuilder &B, CallLoweringInfo &Info,
1030 SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1031 // Must pass all target-independent checks in order to tail call optimize.
1032 if (!Info.IsTailCall)
1033 return false;
1034
1035 MachineFunction &MF = B.getMF();
1036 const Function &CallerF = MF.getFunction();
1037 CallingConv::ID CalleeCC = Info.CallConv;
1038 CallingConv::ID CallerCC = CallerF.getCallingConv();
1039
1040 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1041 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1042 // Kernels aren't callable and don't have a live-in return address, so it
1043 // doesn't make sense to do a tail call with entry functions.
1044 if (!CallerPreserved)
1045 return false;
1046
1047 if (!mayTailCallThisCC(CalleeCC)) {
1048 LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1049 return false;
1050 }
1051
1052 if (any_of(CallerF.args(), [](const Argument &A) {
1053 return A.hasByValAttr() || A.hasSwiftErrorAttr();
1054 })) {
1055 LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1056 "or swifterror arguments\n");
1057 return false;
1058 }
1059
1060 // If we have -tailcallopt, then we're done.
1061 if (MF.getTarget().Options.GuaranteedTailCallOpt)
1062 return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
1063
1064 // Verify that the incoming and outgoing arguments from the callee are
1065 // safe to tail call.
1066 if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1067 LLVM_DEBUG(
1068 dbgs()
1069 << "... Caller and callee have incompatible calling conventions.\n");
1070 return false;
1071 }
1072
1073 if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1074 return false;
1075
1076 LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1077 return true;
1078 }
1079
1080 // Insert outgoing implicit arguments for a call, by inserting copies to the
1081 // implicit argument registers and adding the necessary implicit uses to the
1082 // call instruction.
1083 void AMDGPUCallLowering::handleImplicitCallArguments(
1084 MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
1085 const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1086 ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1087 if (!ST.enableFlatScratch()) {
1088 // Insert copies for the SRD. In the HSA case, this should be an identity
1089 // copy.
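// (SRD = the scratch resource descriptor, a 128-bit value the callee expects
// in SGPR0-SGPR3.)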
1090 auto ScratchRSrcReg =
1091 MIRBuilder.buildCopy(LLT::vector(4, 32), FuncInfo.getScratchRSrcReg());
1092 MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
1093 CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
1094 }
1095
1096 for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1097 MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
1098 CallInst.addReg(ArgReg.first, RegState::Implicit);
1099 }
1100 }
1101
1102 bool AMDGPUCallLowering::lowerTailCall(
1103 MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1104 SmallVectorImpl<ArgInfo> &OutArgs) const {
1105 MachineFunction &MF = MIRBuilder.getMF();
1106 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1107 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1108 const Function &F = MF.getFunction();
1109 MachineRegisterInfo &MRI = MF.getRegInfo();
1110 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1111
1112 // True when we're tail calling, but without -tailcallopt.
1113 bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1114
1115 // Find out which ABI gets to decide where things go.
1116 CallingConv::ID CalleeCC = Info.CallConv;
1117 CCAssignFn *AssignFnFixed;
1118 CCAssignFn *AssignFnVarArg;
1119 std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
1120
1121 MachineInstrBuilder CallSeqStart;
1122 if (!IsSibCall)
1123 CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
1124
1125 unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
1126 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1127 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1128 return false;
1129
1130 // Byte offset for the tail call. When we are sibcalling, this will always
1131 // be 0.
1132 MIB.addImm(0);
1133
1134 // Tell the call which registers are clobbered.
1135 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1136 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1137 MIB.addRegMask(Mask);
1138
1139 // FPDiff is the byte offset of the call's argument area from the callee's.
1140 // Stores to callee stack arguments will be placed in FixedStackSlots offset
1141 // by this amount for a tail call. In a sibling call it must be 0 because the
1142 // caller will deallocate the entire stack and the callee still expects its
1143 // arguments to begin at SP+0.
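// Example (illustrative): if the caller reserved 48 bytes of incoming argument
// space and this callee needs 32, FPDiff = 48 - 32 = 16 (see the computation
// below).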
1144 int FPDiff = 0;
1145
1146 // This will be 0 for sibcalls, potentially nonzero for tail calls produced
1147 // by -tailcallopt. For sibcalls, the memory operands for the call are
1148 // already available in the caller's incoming argument space.
1149 unsigned NumBytes = 0;
1150 if (!IsSibCall) {
1151 // We aren't sibcalling, so we need to compute FPDiff. We need to do this
1152 // before handling assignments, because FPDiff must be known for memory
1153 // arguments.
1154 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1155 SmallVector<CCValAssign, 16> OutLocs;
1156 CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1157
1158 // FIXME: Not accounting for callee implicit inputs
1159 OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1160 if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
1161 return false;
1162
1163 // The callee will pop the argument stack as a tail call. Thus, we must
1164 // keep it 16-byte aligned.
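// e.g. (illustrative) 20 bytes of outgoing arguments rounds NumBytes up to 32.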
1165 NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment());
1166
1167 // FPDiff will be negative if this tail call requires more space than we
1168 // would automatically have in our incoming argument space. Positive if we
1169 // actually shrink the stack.
1170 FPDiff = NumReusableBytes - NumBytes;
1171
1172 // The stack pointer must be 16-byte aligned at all times it's used for a
1173 // memory operation, which in practice means at *all* times and in
1174 // particular across call boundaries. Therefore our own arguments started at
1175 // a 16-byte aligned SP and the delta applied for the tail call should
1176 // satisfy the same constraint.
1177 assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1178 "unaligned stack on tail call");
1179 }
1180
1181 SmallVector<CCValAssign, 16> ArgLocs;
1182 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1183
1184 // We could pass MIB and directly add the implicit uses to the call
1185 // now. However, as an aesthetic choice, place implicit argument operands
1186 // after the ordinary user argument registers.
1187 SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1188
1189 if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
1190 Info.CallConv != CallingConv::AMDGPU_Gfx) {
1191 // With a fixed ABI, allocate fixed registers before user arguments.
1192 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1193 return false;
1194 }
1195
1196 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1197
1198 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1199 return false;
1200
1201 // Do the actual argument marshalling.
1202 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1203 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1204 return false;
1205
1206 handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
1207
1208 // If we have -tailcallopt, we need to adjust the stack. We'll do the call
1209 // sequence start and end here.
1210 if (!IsSibCall) {
1211 MIB->getOperand(1).setImm(FPDiff);
1212 CallSeqStart.addImm(NumBytes).addImm(0);
1213 // End the call sequence *before* emitting the call. Normally, we would
1214 // tidy the frame up after the call. However, here, we've laid out the
1215 // parameters so that when SP is reset, they will be in the correct
1216 // location.
1217 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
1218 }
1219
1220 // Now we can add the actual call instruction to the correct basic block.
1221 MIRBuilder.insertInstr(MIB);
1222
1223 // If Callee is a register, it is used by a target-specific instruction, so
1224 // it must have a register class matching the operand constraint of that
1225 // instruction.
1226
1227 // FIXME: We should define regbankselectable call instructions to handle
1228 // divergent call targets.
1229 if (MIB->getOperand(0).isReg()) {
1230 MIB->getOperand(0).setReg(constrainOperandRegClass(
1231 MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
1232 MIB->getDesc(), MIB->getOperand(0), 0));
1233 }
1234
1235 MF.getFrameInfo().setHasTailCall();
1236 Info.LoweredTailCall = true;
1237 return true;
1238 }
1239
1240 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1241 CallLoweringInfo &Info) const {
1242 if (Info.IsVarArg) {
1243 LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1244 return false;
1245 }
1246
1247 MachineFunction &MF = MIRBuilder.getMF();
1248 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1249 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1250
1251 const Function &F = MF.getFunction();
1252 MachineRegisterInfo &MRI = MF.getRegInfo();
1253 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1254 const DataLayout &DL = F.getParent()->getDataLayout();
1255 CallingConv::ID CallConv = F.getCallingConv();
1256
1257 if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
1258 CallConv != CallingConv::AMDGPU_Gfx) {
1259 LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
1260 return false;
1261 }
1262
1263 if (AMDGPU::isShader(CallConv)) {
1264 LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
1265 return false;
1266 }
1267
1268 SmallVector<ArgInfo, 8> OutArgs;
1269 for (auto &OrigArg : Info.OrigArgs)
1270 splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
1271
1272 SmallVector<ArgInfo, 8> InArgs;
1273 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1274 splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
1275
1276 // If we can lower as a tail call, do that instead.
1277 bool CanTailCallOpt =
1278 isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
1279
1280 // We must emit a tail call if we have musttail.
1281 if (Info.IsMustTailCall && !CanTailCallOpt) {
1282 LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1283 return false;
1284 }
1285
1286 if (CanTailCallOpt)
1287 return lowerTailCall(MIRBuilder, Info, OutArgs);
1288
1289 // Find out which ABI gets to decide where things go.
1290 CCAssignFn *AssignFnFixed;
1291 CCAssignFn *AssignFnVarArg;
1292 std::tie(AssignFnFixed, AssignFnVarArg) =
1293 getAssignFnsForCC(Info.CallConv, TLI);
1294
1295 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1296 .addImm(0)
1297 .addImm(0);
1298
1299 // Create a temporarily-floating call instruction so we can add the implicit
1300 // uses of arg registers.
1301 unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
1302
1303 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1304 MIB.addDef(TRI->getReturnAddressReg(MF));
1305
1306 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1307 return false;
1308
1309 // Tell the call which registers are clobbered.
1310 const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1311 MIB.addRegMask(Mask);
1312
1313 SmallVector<CCValAssign, 16> ArgLocs;
1314 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1315
1316 // We could pass MIB and directly add the implicit uses to the call
1317 // now. However, as an aesthetic choice, place implicit argument operands
1318 // after the ordinary user argument registers.
1319 SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1320
1321 if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
1322 Info.CallConv != CallingConv::AMDGPU_Gfx) {
1323 // With a fixed ABI, allocate fixed registers before user arguments.
1324 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1325 return false;
1326 }
1327
1328 // Do the actual argument marshalling.
1329 SmallVector<Register, 8> PhysRegs;
1330
1331 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1332 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1333 return false;
1334
1335 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1336 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1337 return false;
1338
1339 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1340
1341 handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
1342
1343 // Get a count of how many bytes are to be pushed on the stack.
1344 unsigned NumBytes = CCInfo.getNextStackOffset();
1345
1346 // If Callee is a register, it is used by a target-specific instruction, so
1347 // it must have a register class matching the operand constraint of that
1348 // instruction.
1349
1350 // FIXME: We should define regbankselectable call instructions to handle
1351 // divergent call targets.
1352 if (MIB->getOperand(1).isReg()) {
1353 MIB->getOperand(1).setReg(constrainOperandRegClass(
1354 MF, *TRI, MRI, *ST.getInstrInfo(),
1355 *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1356 1));
1357 }
1358
1359 // Now we can add the actual call instruction to the correct position.
1360 MIRBuilder.insertInstr(MIB);
1361
1362 // Finally we can copy the returned value back into its virtual-register. In
1363 // symmetry with the arguments, the physical register must be an
1364 // implicit-define of the call instruction.
1365 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1366 CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1367 Info.IsVarArg);
1368 OutgoingValueAssigner Assigner(RetAssignFn);
1369 CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1370 if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
1371 Info.CallConv, Info.IsVarArg))
1372 return false;
1373 }
1374
1375 uint64_t CalleePopBytes = NumBytes;
1376
1377 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1378 .addImm(0)
1379 .addImm(CalleePopBytes);
1380
1381 if (!Info.CanLowerReturn) {
1382 insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
1383 Info.DemoteRegister, Info.DemoteStackIndex);
1384 }
1385
1386 return true;
1387 }
1388