//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
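  // For example, a HIP kernel's 'int *' parameter (generic, addrspace(0) on
  // amdgcn) is rewritten to a global 'ptr addrspace(1)' argument.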
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};

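// Any type qualifies as the base type of a homogeneous aggregate here; the
// only restriction is the total register count checked below.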
bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
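/// For example, a struct of four floats counts as 4 registers, while a
/// three-element vector of 16-bit elements packs into 2.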
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or a pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
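/// In practice that means aggregates are passed byref in the constant address
/// space (outside of OpenCL), while everything else is passed directly, with
/// HIP pointer arguments coerced from the generic to the global address space.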
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into single VGPR or pair.
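    // For example, 'struct { char a, b; }' (16 bits) becomes i16 and
    // 'struct { float x, y; }' (64 bits) becomes [2 x i32].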
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
      llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
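/// Currently the only such constant is __oclc_ABI_version, a hidden weak_odr
/// global holding the code object version.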
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(
                        OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then cast to a pointer in the local or private address
// space.
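// On amdgcn, for example, a null private (addrspace(5)) pointer is emitted as
// 'addrspacecast (ptr null to ptr addrspace(5))'.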
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
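  // For example, a relaxed OpenCL workgroup-scope atomic gets the
  // "workgroup-one-as" sync scope rather than plain "workgroup".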
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
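  // When that holds, tag the instruction so the backend knows it never touches
  // scratch; on amdgcn this becomes '!noalias.addrspace !{i32 5, i32 6}'
  // (PRIVATE_ADDRESS is 5).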
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW || !CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
    return;

  // TODO: Introduce new, more controlled options that also work for integers,
  // and deprecate allowAMDGPUUnsafeFPAtomics.
  llvm::AtomicRMWInst::BinOp RMWOp = RMW->getOperation();
  if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
    llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);

    if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW->getType()->isFloatTy())
      RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
  }
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates a struct of the same type on the stack, stores the block literal
/// into it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
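/// Any remaining parameters are the block's local (addrspace(3)) pointer
/// arguments, which the kernel forwards to the invoke function unchanged.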
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

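/// Emit the "amdgpu-flat-work-group-size" function attribute, taking the range
/// from an explicit amdgpu_flat_work_group_size attribute or, failing that,
/// from reqd_work_group_size (in which case min == max == the product of its
/// three dimensions).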
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

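/// Emit the "amdgpu-waves-per-eu" function attribute from an
/// amdgpu_waves_per_eu attribute; if no maximum was given, only the minimum is
/// emitted.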
void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}