//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> UseLegacyDA(
    "amdgpu-use-legacy-divergence-analysis",
    cl::desc("Enable legacy divergence analysis for AMDGPU"),
    cl::init(false), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
               return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }
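
  // For reference, a loop carrying this metadata might look like the following
  // illustrative IR (metadata ids are arbitrary):
  //   br i1 %cond, label %header, label %exit, !llvm.loop !0
  //   ...
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}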

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
             return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing not
        // based on a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop, then increase the max
    // trip count to analyze for a better cost estimation during unrolling.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  AMDGPU::SIModeRegisterDefaults Mode(F);
  HasFP32Denormals = Mode.allFP32Denormals();
  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not a real register class ID. In fact it is 0 or 1 for scalar
  // or vector registers; see getRegisterClassForType for the implementation.
  // In this case "vector" registers are not vector in terms of VGPRs, but
  // registers which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

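// A worked example of the factors returned below, assuming a subtarget with
// 16-bit instructions but without packed FP32 operations: loads and stores of
// i8 elements allow a VF of 32 * 4 / 8 = 16, 16-bit arithmetic allows a VF of
// 2, and 32-bit arithmetic a VF of 1.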
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
                                                       : 1;
}

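// Illustrative numbers for the two vector-factor hooks below: a chain of
// 32 x i8 loads (VF = 32, LoadSize = 8 bits) gives VecRegBitWidth = 256 with a
// sub-32-bit element type and is clamped to 128 / 8 = 16, while a chain of
// 8 x i32 stores is clamped to 128 / 32 = 4.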
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
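//
// For example, under the rules below a copy with minimum alignment 2 is
// lowered with i16 accesses; otherwise, if either side is LDS/region memory,
// <2 x i32> is used, and the remaining cases (e.g. global-to-global) use
// <4 x i32>.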
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

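// As a concrete example of the residual lowering below: with RemainingBytes = 7
// and both alignments >= 4, the emitted operand types are { i32, i16, i8 };
// with a minimum alignment of 2, the i64/i32 steps are skipped and the result
// is { i16, i16, i16, i8 }.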
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {
  assert(RemainingBytes < 16);

  if (AtomicCpySize)
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

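// A worked example for the cost model below: a scalar i64 multiply is modelled
// as 4 quarter-rate plus 4 full-rate instructions, while a <2 x i16> add on a
// subtarget with packed 16-bit instructions folds to a single full-rate
// operation.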
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return zero
    // cost for the fmul(b, c), assuming the fadd|fsub will be charged the
    // estimated cost of the whole fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Assume all types may be fused when contract/unsafe flags are set.
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

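// For example, @llvm.fma.v2f16 on a subtarget with packed 16-bit instructions
// is costed below as a single (half- or quarter-rate) instruction, since the
// two f16 lanes are folded together, while @llvm.fabs.* is always free.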
InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }

  return LT.first * NElts * InstRate;
}

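// Illustrative costs from the hook below with a throughput-like cost kind: an
// unconditional branch costs 4, a conditional branch 7, and a switch with 3
// cases (plus the default) is estimated at 4 * (7 + 1) = 32.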
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Assume a conditional branch takes an additional 3 exec-manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

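// A few data points for the hook below: extracting element 0 of a <4 x i16>
// vector is free when 16-bit instructions are available, extracting a 32-bit
// element at a known constant index is also free, and a dynamically indexed
// extract/insert (Index == ~0u) is costed at 2.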
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}

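// Examples for the classification below: a read_register returning i1 is
// treated as divergent (it looks like a VCC-style lane mask); reads of "vcc"
// with a wider scalar result type, "exec", or "m0" are uniform; reads of VGPRs
// or AGPRs such as "v0" or "a2" are divergent.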
bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {
  Metadata *MD =
      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
  StringRef RegName =
      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();

  // Special case registers that look like VCC.
  MVT VT = MVT::getVT(ReadReg->getType());
  if (VT == MVT::i1)
    return true;

  // Special case scalar registers that start with 'v'.
  if (RegName.startswith("vcc") || RegName.empty())
    return false;

  // VGPR or AGPR is divergent. There aren't any specially named vector
  // registers.
  return RegName[0] == 'v' || RegName[0] == 'a';
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
      return isReadRegisterSourceOfDivergence(Intrinsic);

    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
  }

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions, the value of workitem-id-x
  // divided by the wavefront size can be non-uniform. For example, dimensions
  // (65, 2) will have workitems with addresses (64, 0) and (0, 1) packed into
  // the same wave, which gives 1 and 0 respectively after the division by 64.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible to
  // perform this optimization if the size of the X dimension is a power of 2;
  // we just do not currently have the infrastructure to query it.
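  //
  // For example, on a wave64 target with a 1-D workgroup,
  //   %tid  = call i32 @llvm.amdgcn.workitem.id.x()
  //   %wave = lshr i32 %tid, 6
  // is treated as uniform by the matches below, since every lane of a wave
  // computes the same quotient.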
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    const DataLayout &DL = F->getParent()->getDataLayout();
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin: {
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
                                                  {DestTy, SrcTy, DestTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // A single BB does not increase the total BB count.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}

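// For example, a call that passes a pointer into a 64-byte static alloca gets
// the extra ArgAllocaCost (default 4000) added to its inlining threshold below,
// while a call whose allocas together exceed ArgAllocaCutoff (default 256
// bytes) gets no bonus at all.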
unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
      continue;

    PtrArg = getUnderlyingObject(PtrArg);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
  if (AllocaSize)
    return ArgAllocaCost;
  return 0;
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}

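// For instance, a 1024-bit vector type gets (1024 + 255) / 256 = 4 added to
// its base legalization cost below, reflecting that such an access is split
// into multiple 256-bit (8-dword) pieces.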
std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
  auto Size = DL.getTypeSizeInBits(Ty);
  // The maximum load or store can handle 8 dwords for the scalar ALU and 4 for
  // the vector ALU. Assume anything above 8 dwords is expensive even if legal.
  if (Size <= 256)
    return Cost;

  Cost.first += (Size + 255) / 256;
  return Cost;
}