//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
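    //
    // Editorial sketch (not from the original source): for a HIP kernel
    // such as
    //   __global__ void kern(int *p);
    // the parameter's generic pointer (FromAS, typically addrspace(0)) is
    // rewritten to the global address space (ToAS, addrspace(1) on amdgcn),
    // so the kernel signature uses `ptr addrspace(1)` in IR.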
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
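    //
    // Editorial illustration: a <2 x half> packs into one 32-bit register
    // ((2 + 1) / 2 == 1), while a <3 x i16> needs two ((3 + 1) / 2 == 2).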
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
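      //
      // Editorial illustration: `struct S { float x; };` is returned as a
      // plain `float` rather than as an aggregate.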
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in
  // structs to the global address space when using byref. This would require
  // implementing a new kind of coercion of the in-memory type when used for
  // indirect arguments.
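  //
  // Editorial sketch (assuming the usual amdgcn address-space numbering,
  // where opencl_constant maps to addrspace(4)): a non-OpenCL kernel taking
  // `struct S { int a[16]; }` by value follows the indirect path below and
  // its parameter is emitted roughly as
  //   ptr addrspace(4) byref(%struct.S) align 4 %s
  // i.e. a reference into the read-only kernarg segment rather than a copy.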
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore
  // we have to set it to false here. Other args of getDirect() are just
  // defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using
    // getExpand(), though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
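    //
    // Editorial illustration: `struct { short a; char b; }` (4 bytes) is
    // coerced to a single i32, while `struct { int a; float b; }` (8 bytes)
    // becomes [2 x i32].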
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *
  createEnqueuedBlockKernel(CodeGenFunction &CGF,
                            llvm::Function *BlockInvokeFunc,
                            llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return D->hasAttr<OpenCLKernelAttr>() ||
         (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
         (isa<VarDecl>(D) &&
          (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    unsigned Min = 0;
    unsigned Max = 0;
    if (FlatWGS) {
      Min = FlatWGS->getMin()
                ->EvaluateKnownConstInt(M.getContext())
                .getExtValue();
      Max = FlatWGS->getMax()
                ->EvaluateKnownConstInt(M.getContext())
                .getExtValue();
    }
    if (ReqdWGS && Min == 0 && Max == 0)
      Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

    if (Min != 0) {
      assert(Min <= Max && "Min must be less than or equal to Max");

      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
    } else
      assert(Max == 0 && "Max must be zero");
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
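    //
    // Editorial illustration (assuming the stock defaults): an OpenCL
    // kernel ends up with "amdgpu-flat-work-group-size"="1,256", and a HIP
    // kernel with "amdgpu-flat-work-group-size"="1,1024" unless
    // --gpu-max-threads-per-block overrides the limit.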
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
    unsigned Min =
        Attr->getMin()->EvaluateKnownConstInt(M.getContext()).getExtValue();
    unsigned Max = Attr->getMax() ? Attr->getMax()
                                        ->EvaluateKnownConstInt(M.getContext())
                                        .getExtValue()
                                  : 0;

    if (Min != 0) {
      assert((Max == 0 || Min <= Max) &&
             "Min must be less than or equal to Max");

      std::string AttrVal = llvm::utostr(Min);
      if (Max != 0)
        AttrVal = AttrVal + "," + llvm::utostr(Max);
      F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
    } else
      assert(Max == 0 && "Max must be zero");
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  const bool IsHIPKernel =
      M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();

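  // Editorial note: the attribute added below means a HIP __global__ kernel
  // carries "uniform-work-group-size"="true" in IR, letting the backend
  // assume the grid size is a multiple of the work-group size.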
  // TODO: This should be moved to language-specific attributes instead.
  if (IsHIPKernel)
    F->addFnAttr("uniform-work-group-size", "true");

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in the private and local address spaces, a null
// pointer in the generic address space is emitted which is then cast to a
// pointer in the local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  assert(AddrSpace == LangAS::Default || isTargetAddressSpace(AddrSpace));
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
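  //
  // Editorial illustration: in an address-space-agnostic language, a global
  // such as
  //   const int kTable[4] = {1, 2, 3, 4};
  // is const-qualified and constantly initialized, so it may be promoted to
  // the target's constant address space (addrspace(4) on amdgcn).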
  if (CGM.isTypeConstant(D->getType(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
    Name = "agent";
    break;
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel.
/// The kernel allocates the same type of struct on the stack, stores the
/// block literal to it, and passes its pointer to the block invoke function.
/// The kernel has the "enqueued-block" function attribute and kernel argument
/// metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}