//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
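  // Illustration (editorial note, not from the original source): in CUDA, a
  // kernel parameter such as "n" in "__global__ void k(int n)" has the same
  // value in every thread and is therefore uniform, whereas a helper like
  // "__device__ int f(int i)" may be called with a per-thread value such as
  // threadIdx.x, so its arguments must be assumed divergent.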
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can
  // nonetheless simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    std::optional<Intrinsic::ID> IID;
    std::optional<Instruction::CastOps> CastOp;
    std::optional<Instruction::BinaryOps> BinaryOp;
    std::optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs. denormal-fp-math-f32), so record whether the
    // operand type is half.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_bf16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the
    // rz (round to zero) versions of the nvvm conversion intrinsics, even
    // though most everything else here uses the rn (round to nearest even)
    // nvvm ops.
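    //
    // Illustration (editorial note, not from the original source): a
    // round-to-zero conversion such as
    //   %r = call i32 @llvm.nvvm.f2i.rz(float %x)
    // becomes the equivalent target-generic cast
    //   %r = fptosi float %x to i32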
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
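    //
    // Illustration (editorial note, not from the original source): the
    // reciprocal case below turns
    //   %r = call double @llvm.nvvm.rcp.rn.d(double %x)
    // into
    //   %r = fdiv double 1.000000e+00, %x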
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so we could in theory
    // lower these to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirement is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
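  //
  // Illustration (editorial note, not from the original source): with a
  // function attribute such as
  //   attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
  // getDenormalMode() reports PreserveSign for the output mode, so FtzEnabled
  // is true and only FTZ_MustBeOn transformations proceed past this check.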
  if (Action.FtzRequirement != FTZ_Any) {
    // FIXME: Broken for f64
    DenormalMode Mode = II->getFunction()->getDenormalMode(
        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
    bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }
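  // Illustration (editorial note, not from the original source): for the
  // fmax mapping above,
  //   %r = call float @llvm.nvvm.fmax.f(float %a, float %b)
  // becomes a call to the overload selected by the float type argument:
  //   %r = call float @llvm.maxnum.f32(float %a, float %b)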

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

std::optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
    return I;
  }
  return std::nullopt;
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
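    //
    // Illustration (editorial note, not from the original source): a 64-bit
    // add is conceptually split into a pair of 32-bit adds with carry,
    // roughly the PTX idiom
    //   add.cc.u32  %lo, %a_lo, %b_lo;
    //   addc.u32    %hi, %a_hi, %b_hi;
    // which is why the legalized i64 cost is doubled below.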
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
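  //
  // Rough illustration (editorial note, not from the original source): with a
  // baseline UP.Threshold of, say, 300, the assignment below yields a
  // PartialThreshold of 75.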
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}