Lines Matching +full:atomic +full:- +full:threshold +full:- +full:us

1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
37 #define DEBUG_TYPE "amdgpu-legalinfo"
47 "amdgpu-global-isel-new-legality",
70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
122 // Increase the number of vector elements to reach the next multiple of 32-bit
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
253   // Any combination of 32 or 64-bit elements up to the maximum register size, and
364 // than 32-bits, we need to reduce to a 32-bit type.
373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
474 // arguments. Therefore, we convert resource pointers - or vectors of them
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
547 // have 96-bit scalar loads.
565 return TLI->allowsMisalignedMemoryAccessesImpl(
590 // Paranoidly prevent us from doing this multiple times.
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
621 /// the form in which the value must be in order to be passed to the low-level
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
784 // was introduced at the same time as 16-bit operations.
823 // Clamp bit support was added in VI, along with 16-bit operations.
860 // on the SALU, RegBankSelect will be able to re-legalize.
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1267 // instructions expect. The hardware produces -1, but these produce the
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1386 bool IsLoad) -> bool {
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1458 [=](const LegalityQuery &Query) -> bool {
1471 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1489 [=](const LegalityQuery &Query) -> bool {
1497 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1505 [=](const LegalityQuery &Query) -> bool {
1509 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1530 [=](const LegalityQuery &Query) -> bool {
1534 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1562 // The scalars will need to be re-legalized.
1579 // to the widest type. TODO: Account for alignment. As-is it
1608 [=](const LegalityQuery &Query) -> bool {
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1641 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1643 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1645 Atomic.legalFor({{S64, LocalPtr}});
1647 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1650 Atomic.legalFor({{S32, GlobalPtr}});
1652 Atomic.legalFor({{S32, FlatPtr}});
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor({
1667 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1669 Atomic.legalFor({{V2BF16, GlobalPtr}});
1671 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1727 // TODO: Support 16-bit shift amounts for all types
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731 // 32-bit amount.
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1777 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1794 // For > 64-bit element types, try to turn this into a 64-bit
1811 // TODO: Clamp elements for 64-bit vectors?
1834   // Sub-vector (or single element) insert and extract.
1935 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
2203 // Note: this register is somewhat broken. When used as a 32-bit operand,
2234 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2282 switch (Def->getOpcode()) {
2288 const ConstantInt *CI = Def->getOperand(1).getCImm();
2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2333 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2336 // Extract low 32-bits of the pointer.
2347 // Extract low 32-bits of the pointer.
2372 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2434 // TODO: Should this propagate fast-math-flags?
2470 // TODO: Should this propagate fast-math-flags?
2499 auto Const0 = B.buildConstant(S32, FractBits - 32);
2534 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2538 // Extend back to 64-bits.
2544 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2577 // TODO: Should this propagate fast-math-flags?
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2640 // However, a 32-bit floating point number has only 23 bits mantissa and
2651 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2653 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2656 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2658 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2672 // r := xor({lo, hi}, sign) - sign;
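A minimal scalar sketch of the hi/lo split the comment above describes, in plain C++ rather than the MIR the legalizer builds; std::trunc/std::floor/std::fma stand in for the corresponding operations, and the signed case would apply the final `r := xor({lo, hi}, sign) - sign` step on top.

#include <cmath>
#include <cstdint>

// Convert a non-negative double that fits in 64 bits into two 32-bit words.
uint64_t f64ToU64ViaPair(double X) {
  double Tf = std::trunc(X);                    // integral part
  double Hif = std::floor(Tf * 0x1.0p-32);      // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1.0p+32, Tf);   // lof := tf - hif * 2^32 (>= 0 due to floor)
  uint32_t Hi = static_cast<uint32_t>(Hif);
  uint32_t Lo = static_cast<uint32_t>(Lof);
  return (uint64_t(Hi) << 32) | Lo;
}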
2692 if (!MFI->getMode().IEEE)
2739 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2875 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2886 if (!B.getMRI()->getRegClassOrNull(PCReg))
2887 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2919 "Must provide a 64-bit pointer type!");
2962 if (!MFI->isModuleEntryFunction() &&
2963 GV->getName() != "llvm.amdgcn.module.lds") {
2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2985 if (!TLI->shouldUseLDSConstAddress(GV)) {
2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991 Type *Ty = GV->getValueType();
2993 // zero-sized type in other languages to declare the dynamic shared
2999 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3008 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3022 if (TLI->shouldEmitFixup(GV)) {
3028 if (TLI->shouldEmitPCReloc(GV)) {
3047 // Truncate if this is a 32-bit constant address.
3098 const LLT MemTy = MMO->getMemoryType();
3099 const Align MemAlign = MMO->getAlign();
3103 // Widen non-power-of-2 loads to the alignment if needed
3135 // (e.g. <3 x s32> -> <4 x s32>)
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3224 switch (DefMI->getOpcode()) {
3226 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3236 if (DefMI->getOperand(0).getReg() == Src)
3241 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3295 LLT Ty = B.getMRI()->getType(Dst);
3384 const float c_log10 = 0x1.344134p-2f;
3385 const float cc_log10 = 0x1.09f79ep-26f;
3388 const float c_log = 0x1.62e42ep-1f;
3389 const float cc_log = 0x1.efa39ep-25f;
3401 const float ch_log10 = 0x1.344000p-2f;
3402 const float ct_log10 = 0x1.3509f6p-18f;
3405 const float ch_log = 0x1.62e000p-1f;
3406 const float ct_log = 0x1.0bfbe8p-15f;
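A minimal scalar sketch of how a c/cc pair like the ones above is used on the fast-FMA path (the ch/ct pair plays the same role where fast f32 fma is unavailable); this illustrates the split-constant multiply only, not the exact instruction order the legalizer emits.

#include <cmath>

// ln(x) ~= log2(x) * ln(2), with ln(2) carried as head c plus tail cc and the
// rounding error of the head product recovered with fma.
float lnFromLog2(float Y /* = log2(x) */) {
  const float CLog = 0x1.62e42ep-1f;    // head of ln(2)
  const float CcLog = 0x1.efa39ep-25f;  // tail; c + cc hold ~49 bits
  float R = Y * CLog;
  float Fma0 = std::fma(Y, CLog, -R);   // rounding error of the head product
  float Fma1 = std::fma(Y, CcLog, Fma0);
  return R + Fma1;                      // ~ log2(x) * ln(2)
}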
3455 LLT Ty = B.getMRI()->getType(Dst);
3463 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3498 LLT Ty = B.getMRI()->getType(Dst);
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3526 // -nextafter(128.0, -1)
3527 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3540 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
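A minimal scalar sketch of the scaling trick the needs_scaling comment above spells out, with std::exp2 standing in for v_exp_f32; the threshold and the 2^-64 unscale factor are the constants shown in the surrounding lines.

#include <cmath>

// If x is low enough that exp2(x) would land in the denormal range, bias the
// input up by 64 and undo the bias with a 2^-64 multiply on the result.
float exp2WithDenormScaling(float X) {
  bool NeedsScaling = X < -0x1.f80000p+6f;
  float Biased = X + (NeedsScaling ? 0x1.0p+6f : 0.0f);
  float R = std::exp2(Biased);              // stand-in for v_exp_f32
  return NeedsScaling ? R * 0x1.0p-64f : R;
}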
3550 LLT Ty = B.getMRI()->getType(Dst);
3568 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3570 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3582 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3609 // exp(f16 x) ->
3624 // library behavior. Also, is known-not-daz source sufficient?
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3662 const float cc_exp10 = 0x1.2f346ep-24f;
3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3676 const float cl_exp10 = 0x1.4f0978p-11f;
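A minimal scalar sketch of the head/tail (ch/cl) path for exp: x*log2(e) is formed from a head with a short mantissa (so its product is exact) plus correction terms, then fed to exp2. The ch_exp head value below is assumed (it is not in this excerpt), and the exact operation order in the legalizer may differ.

#include <bit>
#include <cmath>
#include <cstdint>

float expViaExp2(float X) {
  const float ChExp = 0x1.714000p+0f;   // head of log2(e) (assumed value)
  const float ClExp = 0x1.47652ap-12f;  // tail; ch + cl are 36 bits
  // Split x so that Xh keeps only high mantissa bits and Xh*ChExp is exact.
  float Xh = std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0xfffff000u);
  float Xl = X - Xh;
  float P = Xh * ChExp + (Xl * ChExp + X * ClExp);  // ~ x * log2(e)
  return std::exp2(P);                              // stand-in for v_exp_f32
}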
3706 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3736 LLT Ty = B.getMRI()->getType(Dst);
3768 ModSrc = SrcFNeg->getOperand(1).getReg();
3770 ModSrc = SrcFAbs->getOperand(1).getReg();
3772 ModSrc = SrcFAbs->getOperand(1).getReg();
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3794 // Convert floor(x) to (x - fract(x))
3815 if (MFI->getMode().IEEE)
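A minimal scalar sketch of the floor(x) = x - fract(x) rewrite named above, with the clamp-below-1.0 and inf/nan passthrough that the f64 workaround needs; std::floor stands in for the fract primitive, so this is an illustration rather than the IR built here.

#include <algorithm>
#include <cmath>

double floorFromFract(double X) {
  double Fract = X - std::floor(X);               // stand-in for the fract operation
  Fract = std::min(Fract, 0x1.fffffffffffffp-1);  // clamp to the largest double < 1.0
  if (std::isnan(X) || std::isinf(X))
    Fract = 0.0;                                  // leave inf/nan inputs unchanged
  return X - Fract;                               // floor(x) = x - fract(x)
}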
3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3860 // Source and accumulation registers must all be 32-bits.
3887 auto getZero32 = [&]() -> Register {
3892 auto getZero64 = [&]() -> Register {
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3905 // in-place.
3907 // Returns the carry-out, which is a single S1 register or null.
3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3942 // Build a multiply-add chain to compute
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3954 -> Carry {
3961 // Use plain 32-bit multiplication for the most significant part of the
3967 unsigned j1 = DstIndex - j0;
3989 // Build full 64-bit multiplies.
4012 unsigned j1 = DstIndex - j0;
4043   //   Dest index relative to 2 * i:        1 0 -1
4044   //                                        ------
4046   //   Even-aligned partial product sum:    E E .
4047   //   Odd-aligned partial product sum:       O O
4069 // Partial products at offset 2 * i - 1.
4072 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4085 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4087 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4089 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4092 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4096 Lo->getOperand(1).getReg());
4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4116 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
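A minimal scalar model of the arithmetic the MAD_64_32 chains above implement: schoolbook multiplication over little-endian 32-bit limbs, where each inner step is one 32x32->64 multiply folded into a 64-bit add. This models only the pure multiply case (the multiply-add variant seeds the accumulator instead of zeroing it) and ignores the even/odd carry scheduling the real code performs.

#include <cstdint>
#include <vector>

std::vector<uint32_t> mulLimbs(const std::vector<uint32_t> &A,
                               const std::vector<uint32_t> &B) {
  std::vector<uint32_t> Accum(A.size() + B.size(), 0);
  for (size_t I = 0; I < A.size(); ++I) {
    uint64_t Carry = 0;
    for (size_t J = 0; J < B.size(); ++J) {
      // One MAD_64_32-like step: 32x32->64 multiply plus 64-bit accumulate.
      uint64_t T = uint64_t(A[I]) * B[J] + Accum[I + J] + Carry;
      Accum[I + J] = uint32_t(T);
      Carry = T >> 32;
    }
    Accum[I + B.size()] = uint32_t(Carry);  // this limb is untouched so far
  }
  return Accum;
}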
4145 // Whether to compute odd-aligned partial products separately. This is
4147 // in an even-aligned VGPR.
4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4199 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
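One way to read the 32u - NumBits shift above, as a scalar sketch: a value narrower than 32 bits is shifted into the top of a 32-bit register so the 32-bit count-leading-zeros gives the narrow answer directly. Assumes a nonzero input, matching the _zero_undef form; the real code builds MIR rather than calling std::countl_zero.

#include <bit>
#include <cstdint>

unsigned ctlzNarrow(uint32_t X, unsigned NumBits) {
  // e.g. NumBits == 16, X == 0x0001: shifted value is 0x00010000, result is 15.
  return std::countl_zero(uint32_t(X << (32u - NumBits)));
}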
4208 // Check that this is a G_XOR x, -1
4213 return ConstVal && *ConstVal == -1;
4228 Register NegatedCond = UseMI->getOperand(0).getReg();
4239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4243 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4244 if (Next == Parent->end()) {
4245 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4250 if (Next->getOpcode() != AMDGPU::G_BR)
4253 UncondBrTarget = Br->getOperand(0).getMBB();
4263 MCRegister SrcReg = Arg->getRegister();
4269 if (Arg->isMasked()) {
4272 const unsigned Mask = Arg->getMask();
4308 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4335 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4351 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4384 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4395 if (Arg->isMasked()) {
4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4437 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4520 // %mul2 = G_FMUL %mul1, 2**(-32)
4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4540 // 2**(-32)
4545 // -(2**32)
4722 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4800 // 1 / x -> RCP(x)
4801 if (CLHS->isExactlyValue(1.0)) {
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS->isExactlyValue(-1.0)) {
4828 // x / y -> x * (1.0 / y)
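A minimal scalar sketch of the fast-math fdiv rewrites listed above. In the legalizer the 1.0 / -1.0 cases are matched on a constant numerator at compile time; the runtime checks here are only for illustration, and rcp() is a stand-in for the hardware reciprocal approximation.

// Stand-in for V_RCP_F32 (an approximate reciprocal on the real hardware).
static float rcp(float V) { return 1.0f / V; }

float fdivFast(float X, float Y) {
  if (X == 1.0f)
    return rcp(Y);    // 1 / y  -> RCP(y)
  if (X == -1.0f)
    return rcp(-Y);   // -1 / y -> RCP(FNEG(y))
  return X * rcp(Y);  // x / y  -> x * (1.0 / y)
}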
4947 SIModeRegisterDefaults Mode = MFI->getMode();
5155 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5211 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5223 auto NegOne = B.buildConstant(I32, -1);
5260 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5283 // r0 = 0.5 - h0 * g0
5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
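A minimal scalar sketch of the coupled refinement the r0/r1/r2 lines above describe, assuming g ~ sqrt(x) and h ~ 0.5/sqrt(x) come from an initial rsq estimate; each r = 0.5 - h*g step tightens both approximations, and the residual d = x - g*g drives the final correction. This mirrors the shape of the expansion, not the exact f64 sequence.

#include <cmath>

double refineSqrt(double X, double G /* ~ sqrt(x) */, double H /* ~ 0.5/sqrt(x) */) {
  for (int Step = 0; Step < 2; ++Step) {
    double R = std::fma(-H, G, 0.5);  // r = 0.5 - h * g
    G = std::fma(G, R, G);            // g += g * r
    H = std::fma(H, R, H);            // h += h * r
  }
  double D = std::fma(-G, G, X);      // d = x - g * g
  return std::fma(D, H, G);           // final Newton-style correction
}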
5306 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5340 auto ScaleDownFactor = B.buildConstant(S32, -128);
5345 // with finite only or nsz because rsq(+/-0) = +/-inf
5350 // If x is +INF, +0, or -0, use its original value
5375 // +-max_float.
5403 const bool UseIEEE = MFI->getMode().IEEE;
5431 Register Src2, LLT VT) -> Register {
5543 ST.getTargetLowering()->getImplicitParameterOffset(
5558 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5583 if (!StrideConst || !StrideConst->Value.isZero()) {
5586 uint32_t StrideVal = StrideConst->Value.getZExtValue();
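A rough sketch of the descriptor the comment above is building: a 4-dword buffer resource where the low 48 bits hold the base address, the stride replaces the upper half of the second word, and num_records/flags fill the last two words. Field widths and flag encodings here are illustrative only and vary by generation.

#include <cstdint>

struct BufferRsrc { uint32_t Words[4]; };

BufferRsrc makeBufferRsrc(uint64_t BasePtr, uint32_t Stride, uint32_t NumRecords,
                          uint32_t Flags) {
  BufferRsrc R;
  R.Words[0] = uint32_t(BasePtr);                    // base address, low 32 bits
  R.Words[1] = (uint32_t(BasePtr >> 32) & 0xFFFFu)   // base address, high 16 bits
               | ((Stride & 0x3FFFu) << 16);         // stride packed above it
  R.Words[2] = NumRecords;
  R.Words[3] = Flags;                                // format / swizzle / etc.
  return R;
}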
5606 if (!MFI->isEntryFunction()) {
5635 if (!MFI->isEntryFunction()) {
5692 ImmOffset -= Overflow;
5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5780 LLT Ty = MRI->getType(VData);
5820 const int MemSize = MMO->getSize().getValue();
5884 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5908 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5919 const LLT MemTy = MMO->getMemoryType();
6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6024 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6144 llvm_unreachable("unhandled atomic opcode");
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159   // p8 arguments to the atomic itself
6206 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6222 auto EndIdx = Intr->VAddrEnd;
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6238 // occupies full 32-bit.
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6305 /// Depending on the subtarget, load/store with 16-bit element data need to be
6306 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6307 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
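For the packed-layout case mentioned above, two 16-bit elements share one 32-bit register; a trivial sketch of that packing (unpacked subtargets instead keep each s16 in the low half of its own 32-bit register):

#include <cstdint>

uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);  // v2s16 packed into one dword
}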
6329 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6342 Ty = MRI->getType(VData);
6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6351 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6353 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6360 if (!BaseOpcode->Atomic) {
6361 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6362 if (BaseOpcode->Gather4) {
6366 } else if (!IsTFE && !BaseOpcode->Store) {
6367 // If dmask is 0, this is a no-op load. This can be eliminated.
6382 if (BaseOpcode->Store)
6384 else if (BaseOpcode->NoReturn)
6390       // Expecting to get an error flag since TFC is on - and dmask is 0. Force
6395 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6398 if (BaseOpcode->Atomic) {
6400 LLT Ty = MRI->getType(VData0);
6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6406 if (BaseOpcode->AtomicX2) {
6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6440 // See also below in the non-a16 branch
6450 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6452 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6472 if (I - Intr->VAddrStart < NumPacked)
6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6479 // without introducing moves, then using the non-sequential address encoding
6485 // do so, so force non-NSA for the common 2-address case as a heuristic.
6487 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6500 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501 Intr->NumVAddrs - NSAMaxSize + 1);
6502 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6504 Intr->NumVAddrs);
6515 if (BaseOpcode->NoReturn) { // No TFE for stores?
6539   // Image atomic instructions use DMask to specify how many bits the
6540   // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6541 // DMaskLanes for image atomic has default value '0'.
6542 // We must be sure that atomic variants (especially packed) will not be
6554 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6593 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6604 if (MRI->getType(Dst1Reg) != S32)
6621 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6629 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6677 LLT ResTy = MRI->getType(ResultRegs[0]);
6679 padWithUndef(ResTy, NumElts - ResultRegs.size());
6701 if (MRI->getType(DstReg).getNumElements() <
6702 MRI->getType(NewResultReg).getNumElements()) {
6710 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6722 LLT Ty = B.getMRI()->getType(OrigDst);
6730 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6732 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6816 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817 MF->push_back(TrapBB);
6818 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6840 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6873 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6894 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6909 // Is non-HSA path or trap-handler disabled? Then, report a warning
6919 // Insert debug-trap instruction
6981 assert(Opcode != -1);
7105 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7194 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7199 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7213 Br->getOperand(0).setMBB(CondBrTarget);
7221 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7222 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7224 BrCond->eraseFromParent();
7239 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7245 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7251 Br->getOperand(0).setMBB(CondBrTarget);
7256 BrCond->eraseFromParent();
7257 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());