Lines Matching +full:atomic +full:- +full:threshold +full:- +full:us

1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
37 #define DEBUG_TYPE "amdgpu-legalinfo"
47 "amdgpu-global-isel-new-legality",
70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
122 // Increase the number of vector elements to reach the next multiple of 32-bit
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
253   // Any combination of 32 or 64-bit elements up to the maximum register size, and
364 // than 32-bits, we need to reduce to a 32-bit type.
373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
474 // arguments. Therefore, we convert resource pointers - or vectors of them
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
547 // have 96-bit scalar loads.
565 return TLI->allowsMisalignedMemoryAccessesImpl(
590 // Paranoidly prevent us from doing this multiple times.
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
621 /// the form in which the value must be in order to be passed to the low-level
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
784 // was introduced at the same time as 16-bit operations.
823 // Clamp bit support was added in VI, along with 16-bit operations.
860 // on the SALU, RegBankSelect will be able to re-legalize.
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1267 // instructions expect. The hardware produces -1, but these produce the
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1386 bool IsLoad) -> bool {
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1458 [=](const LegalityQuery &Query) -> bool {
1471 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1489 [=](const LegalityQuery &Query) -> bool {
1497 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1505 [=](const LegalityQuery &Query) -> bool {
1509 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1530 [=](const LegalityQuery &Query) -> bool {
1534 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1562 // The scalars will need to be re-legalized.
1579 // to the widest type. TODO: Account for alignment. As-is it
1608 [=](const LegalityQuery &Query) -> bool {
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1641 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1643 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1645 Atomic.legalFor({{S64, LocalPtr}});
1647 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1650 Atomic.legalFor({{S32, GlobalPtr}});
1652 Atomic.legalFor({{S32, FlatPtr}});
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor({
1667 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1669 Atomic.legalFor({{V2BF16, GlobalPtr}});
1671 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1727 // TODO: Support 16-bit shift amounts for all types
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731 // 32-bit amount.
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1777 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1794 // For > 64-bit element types, try to turn this into a 64-bit
1811 // TODO: Clamp elements for 64-bit vectors?
1834   // Sub-vector (or single element) insert and extract.
1935 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
2203 // Note: this register is somewhat broken. When used as a 32-bit operand,
2234 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2282 switch (Def->getOpcode()) {
2288 const ConstantInt *CI = Def->getOperand(1).getCImm();
2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2333 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2336 // Extract low 32-bits of the pointer.
2347 // Extract low 32-bits of the pointer.
2372 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2434 // TODO: Should this propagate fast-math-flags?
2470 // TODO: Should this propagate fast-math-flags?
2499 auto Const0 = B.buildConstant(S32, FractBits - 32);
2534 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2538 // Extend back to 64-bits.
2544 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2577 // TODO: Should this propagate fast-math-flags?
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2640 // However, a 32-bit floating point number has only 23 bits mantissa and
2651 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2653 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2656 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2658 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2672 // r := xor({lo, hi}, sign) - sign;
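A minimal scalar sketch of the hi/lo split the comment above describes, in plain C++ rather than the MIR the legalizer builds; std::trunc/std::floor/std::fma stand in for the corresponding operations, and the signed case would apply the final `r := xor({lo, hi}, sign) - sign` step on top.

#include <cmath>
#include <cstdint>

// Convert a non-negative double that fits in 64 bits into two 32-bit words.
uint64_t f64ToU64ViaPair(double X) {
  double Tf = std::trunc(X);                    // integral part
  double Hif = std::floor(Tf * 0x1.0p-32);      // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1.0p+32, Tf);   // lof := tf - hif * 2^32 (>= 0 due to floor)
  uint32_t Hi = static_cast<uint32_t>(Hif);
  uint32_t Lo = static_cast<uint32_t>(Lof);
  return (uint64_t(Hi) << 32) | Lo;
}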
2692 if (!MFI->getMode().IEEE)
2739 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2875 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2886 if (!B.getMRI()->getRegClassOrNull(PCReg))
2887 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2919 "Must provide a 64-bit pointer type!");
2962 if (!MFI->isModuleEntryFunction() &&
2963 GV->getName() != "llvm.amdgcn.module.lds") {
2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2985 if (!TLI->shouldUseLDSConstAddress(GV)) {
2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991 Type *Ty = GV->getValueType();
2993 // zero-sized type in other languages to declare the dynamic shared
2999 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3008 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3022 if (TLI->shouldEmitFixup(GV)) {
3028 if (TLI->shouldEmitPCReloc(GV)) {
3047 // Truncate if this is a 32-bit constant address.
3098 const LLT MemTy = MMO->getMemoryType();
3099 const Align MemAlign = MMO->getAlign();
3103 // Widen non-power-of-2 loads to the alignment if needed
3135 // (e.g. <3 x s32> -> <4 x s32>)
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3224 switch (DefMI->getOpcode()) {
3226 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3236 if (DefMI->getOperand(0).getReg() == Src)
3241 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3295 LLT Ty = B.getMRI()->getType(Dst);
3384 const float c_log10 = 0x1.344134p-2f;
3385 const float cc_log10 = 0x1.09f79ep-26f;
3388 const float c_log = 0x1.62e42ep-1f;
3389 const float cc_log = 0x1.efa39ep-25f;
3401 const float ch_log10 = 0x1.344000p-2f;
3402 const float ct_log10 = 0x1.3509f6p-18f;
3405 const float ch_log = 0x1.62e000p-1f;
3406 const float ct_log = 0x1.0bfbe8p-15f;
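A minimal scalar sketch of how a c/cc pair like the ones above is used on the fast-FMA path (the ch/ct pair plays the same role where fast f32 fma is unavailable); this illustrates the split-constant multiply only, not the exact instruction order the legalizer emits.

#include <cmath>

// ln(x) ~= log2(x) * ln(2), with ln(2) carried as head c plus tail cc and the
// rounding error of the head product recovered with fma.
float lnFromLog2(float Y /* = log2(x) */) {
  const float CLog = 0x1.62e42ep-1f;    // head of ln(2)
  const float CcLog = 0x1.efa39ep-25f;  // tail; c + cc hold ~49 bits
  float R = Y * CLog;
  float Fma0 = std::fma(Y, CLog, -R);   // rounding error of the head product
  float Fma1 = std::fma(Y, CcLog, Fma0);
  return R + Fma1;                      // ~ log2(x) * ln(2)
}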
3455 LLT Ty = B.getMRI()->getType(Dst);
3463 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3498 LLT Ty = B.getMRI()->getType(Dst);
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3526 // -nextafter(128.0, -1)
3527 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3540 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
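A minimal scalar sketch of the scaling trick the needs_scaling comment above spells out, with std::exp2 standing in for v_exp_f32; the threshold and the 2^-64 unscale factor are the constants shown in the surrounding lines.

#include <cmath>

// If x is low enough that exp2(x) would land in the denormal range, bias the
// input up by 64 and undo the bias with a 2^-64 multiply on the result.
float exp2WithDenormScaling(float X) {
  bool NeedsScaling = X < -0x1.f80000p+6f;
  float Biased = X + (NeedsScaling ? 0x1.0p+6f : 0.0f);
  float R = std::exp2(Biased);              // stand-in for v_exp_f32
  return NeedsScaling ? R * 0x1.0p-64f : R;
}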
3550 LLT Ty = B.getMRI()->getType(Dst);
3568 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3570 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3582 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3609 // exp(f16 x) ->
3624 // library behavior. Also, is known-not-daz source sufficient?
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3662 const float cc_exp10 = 0x1.2f346ep-24f;
3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3676 const float cl_exp10 = 0x1.4f0978p-11f;
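A minimal scalar sketch of the head/tail (ch/cl) path for exp: x*log2(e) is formed from a head with a short mantissa (so its product is exact) plus correction terms, then fed to exp2. The ch_exp head value below is assumed (it is not in this excerpt), and the exact operation order in the legalizer may differ.

#include <bit>
#include <cmath>
#include <cstdint>

float expViaExp2(float X) {
  const float ChExp = 0x1.714000p+0f;   // head of log2(e) (assumed value)
  const float ClExp = 0x1.47652ap-12f;  // tail; ch + cl are 36 bits
  // Split x so that Xh keeps only high mantissa bits and Xh*ChExp is exact.
  float Xh = std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0xfffff000u);
  float Xl = X - Xh;
  float P = Xh * ChExp + (Xl * ChExp + X * ClExp);  // ~ x * log2(e)
  return std::exp2(P);                              // stand-in for v_exp_f32
}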
3706 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3736 LLT Ty = B.getMRI()->getType(Dst);
3768 ModSrc = SrcFNeg->getOperand(1).getReg();
3770 ModSrc = SrcFAbs->getOperand(1).getReg();
3772 ModSrc = SrcFAbs->getOperand(1).getReg();
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3794 // Convert floor(x) to (x - fract(x))
3815 if (MFI->getMode().IEEE)
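A minimal scalar sketch of the floor(x) = x - fract(x) rewrite named above, with the clamp-below-1.0 and inf/nan passthrough that the f64 workaround needs; std::floor stands in for the fract primitive, so this is an illustration rather than the IR built here.

#include <algorithm>
#include <cmath>

double floorFromFract(double X) {
  double Fract = X - std::floor(X);               // stand-in for the fract operation
  Fract = std::min(Fract, 0x1.fffffffffffffp-1);  // clamp to the largest double < 1.0
  if (std::isnan(X) || std::isinf(X))
    Fract = 0.0;                                  // leave inf/nan inputs unchanged
  return X - Fract;                               // floor(x) = x - fract(x)
}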
3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3860 // Source and accumulation registers must all be 32-bits.
3887 auto getZero32 = [&]() -> Register {
3892 auto getZero64 = [&]() -> Register {
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3905 // in-place.
3907 // Returns the carry-out, which is a single S1 register or null.
3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3942 // Build a multiply-add chain to compute
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3954 -> Carry {
3961 // Use plain 32-bit multiplication for the most significant part of the
3967 unsigned j1 = DstIndex - j0;
3989 // Build full 64-bit multiplies.
4012 unsigned j1 = DstIndex - j0;
4043   //   Dest index relative to 2 * i:        1 0 -1
4044   //                                        ------
4046   //   Even-aligned partial product sum:    E E .
4047   //   Odd-aligned partial product sum:       O O
4069 // Partial products at offset 2 * i - 1.
4072 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4085 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4087 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4089 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4092 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4096 Lo->getOperand(1).getReg());
4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4116 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
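A minimal scalar model of the arithmetic the MAD_64_32 chains above implement: schoolbook multiplication over little-endian 32-bit limbs, where each inner step is one 32x32->64 multiply folded into a 64-bit add. This models only the pure multiply case (the multiply-add variant seeds the accumulator instead of zeroing it) and ignores the even/odd carry scheduling the real code performs.

#include <cstdint>
#include <vector>

std::vector<uint32_t> mulLimbs(const std::vector<uint32_t> &A,
                               const std::vector<uint32_t> &B) {
  std::vector<uint32_t> Accum(A.size() + B.size(), 0);
  for (size_t I = 0; I < A.size(); ++I) {
    uint64_t Carry = 0;
    for (size_t J = 0; J < B.size(); ++J) {
      // One MAD_64_32-like step: 32x32->64 multiply plus 64-bit accumulate.
      uint64_t T = uint64_t(A[I]) * B[J] + Accum[I + J] + Carry;
      Accum[I + J] = uint32_t(T);
      Carry = T >> 32;
    }
    Accum[I + B.size()] = uint32_t(Carry);  // this limb is untouched so far
  }
  return Accum;
}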
4145 // Whether to compute odd-aligned partial products separately. This is
4147 // in an even-aligned VGPR.
4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4199 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
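One way to read the 32u - NumBits shift above, as a scalar sketch: a value narrower than 32 bits is shifted into the top of a 32-bit register so the 32-bit count-leading-zeros gives the narrow answer directly. Assumes a nonzero input, matching the _zero_undef form; the real code builds MIR rather than calling std::countl_zero.

#include <bit>
#include <cstdint>

unsigned ctlzNarrow(uint32_t X, unsigned NumBits) {
  // e.g. NumBits == 16, X == 0x0001: shifted value is 0x00010000, result is 15.
  return std::countl_zero(uint32_t(X << (32u - NumBits)));
}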
4208 // Check that this is a G_XOR x, -1
4213 return ConstVal && *ConstVal == -1;
4228 Register NegatedCond = UseMI->getOperand(0).getReg();
4239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4243 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4244 if (Next == Parent->end()) {
4245 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4250 if (Next->getOpcode() != AMDGPU::G_BR)
4253 UncondBrTarget = Br->getOperand(0).getMBB();
4263 MCRegister SrcReg = Arg->getRegister();
4269 if (Arg->isMasked()) {
4272 const unsigned Mask = Arg->getMask();
4308 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4335 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4351 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4384 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4395 if (Arg->isMasked()) {
4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4437 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4520 // %mul2 = G_FMUL %mul1, 2**(-32)
4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4540 // 2**(-32)
4545 // -(2**32)
4722 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4800 // 1 / x -> RCP(x)
4801 if (CLHS->isExactlyValue(1.0)) {
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS->isExactlyValue(-1.0)) {
4828 // x / y -> x * (1.0 / y)
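A minimal scalar sketch of the fast-math fdiv rewrites listed above. In the legalizer the 1.0 / -1.0 cases are matched on a constant numerator at compile time; the runtime checks here are only for illustration, and rcp() is a stand-in for the hardware reciprocal approximation.

// Stand-in for V_RCP_F32 (an approximate reciprocal on the real hardware).
static float rcp(float V) { return 1.0f / V; }

float fdivFast(float X, float Y) {
  if (X == 1.0f)
    return rcp(Y);    // 1 / y  -> RCP(y)
  if (X == -1.0f)
    return rcp(-Y);   // -1 / y -> RCP(FNEG(y))
  return X * rcp(Y);  // x / y  -> x * (1.0 / y)
}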
4947 SIModeRegisterDefaults Mode = MFI->getMode();
5155 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5211 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5223 auto NegOne = B.buildConstant(I32, -1);
5260 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5283 // r0 = 0.5 - h0 * g0
5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
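A minimal scalar sketch of the coupled refinement the r0/r1/r2 lines above describe, assuming g ~ sqrt(x) and h ~ 0.5/sqrt(x) come from an initial rsq estimate; each r = 0.5 - h*g step tightens both approximations, and the residual d = x - g*g drives the final correction. This mirrors the shape of the expansion, not the exact f64 sequence.

#include <cmath>

double refineSqrt(double X, double G /* ~ sqrt(x) */, double H /* ~ 0.5/sqrt(x) */) {
  for (int Step = 0; Step < 2; ++Step) {
    double R = std::fma(-H, G, 0.5);  // r = 0.5 - h * g
    G = std::fma(G, R, G);            // g += g * r
    H = std::fma(H, R, H);            // h += h * r
  }
  double D = std::fma(-G, G, X);      // d = x - g * g
  return std::fma(D, H, G);           // final Newton-style correction
}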
5306 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5340 auto ScaleDownFactor = B.buildConstant(S32, -128);
5345 // with finite only or nsz because rsq(+/-0) = +/-inf
5350 // If x is +INF, +0, or -0, use its original value
5375 // +-max_float.
5403 const bool UseIEEE = MFI->getMode().IEEE;
5431 Register Src2, LLT VT) -> Register {
5543 ST.getTargetLowering()->getImplicitParameterOffset(
5558 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5583 if (!StrideConst || !StrideConst->Value.isZero()) {
5586 uint32_t StrideVal = StrideConst->Value.getZExtValue();
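A rough sketch of the descriptor the comment above is building: a 4-dword buffer resource where the low 48 bits hold the base address, the stride replaces the upper half of the second word, and num_records/flags fill the last two words. Field widths and flag encodings here are illustrative only and vary by generation.

#include <cstdint>

struct BufferRsrc { uint32_t Words[4]; };

BufferRsrc makeBufferRsrc(uint64_t BasePtr, uint32_t Stride, uint32_t NumRecords,
                          uint32_t Flags) {
  BufferRsrc R;
  R.Words[0] = uint32_t(BasePtr);                    // base address, low 32 bits
  R.Words[1] = (uint32_t(BasePtr >> 32) & 0xFFFFu)   // base address, high 16 bits
               | ((Stride & 0x3FFFu) << 16);         // stride packed above it
  R.Words[2] = NumRecords;
  R.Words[3] = Flags;                                // format / swizzle / etc.
  return R;
}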
5606 if (!MFI->isEntryFunction()) {
5635 if (!MFI->isEntryFunction()) {
5692 ImmOffset -= Overflow;
5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5780 LLT Ty = MRI->getType(VData);
5820 const int MemSize = MMO->getSize().getValue();
5884 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5908 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5919 const LLT MemTy = MMO->getMemoryType();
6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6024 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6144 llvm_unreachable("unhandled atomic opcode");
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159   // p8 arguments to the atomic itself
6206 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6222 auto EndIdx = Intr->VAddrEnd;
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6238 // occupies full 32-bit.
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6305 /// Depending on the subtarget, load/store with 16-bit element data need to be
6306 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6307 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
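For the packed-layout case mentioned above, two 16-bit elements share one 32-bit register; a trivial sketch of that packing (unpacked subtargets instead keep each s16 in the low half of its own 32-bit register):

#include <cstdint>

uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);  // v2s16 packed into one dword
}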
6329 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6342 Ty = MRI->getType(VData);
6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6351 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6353 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6360 if (!BaseOpcode->Atomic) {
6361 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6362 if (BaseOpcode->Gather4) {
6366 } else if (!IsTFE && !BaseOpcode->Store) {
6367 // If dmask is 0, this is a no-op load. This can be eliminated.
6382 if (BaseOpcode->Store)
6384 else if (BaseOpcode->NoReturn)
6390       // Expecting to get an error flag since TFC is on - and dmask is 0. Force
6395 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6398 if (BaseOpcode->Atomic) {
6400 LLT Ty = MRI->getType(VData0);
6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6406 if (BaseOpcode->AtomicX2) {
6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6440 // See also below in the non-a16 branch
6450 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6452 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6472 if (I - Intr->VAddrStart < NumPacked)
6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6479 // without introducing moves, then using the non-sequential address encoding
6485 // do so, so force non-NSA for the common 2-address case as a heuristic.
6487 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6500 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501 Intr->NumVAddrs - NSAMaxSize + 1);
6502 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6504 Intr->NumVAddrs);
6515 if (BaseOpcode->NoReturn) { // No TFE for stores?
6539   // Image atomic instructions use DMask to specify how many bits the
6540   // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6541 // DMaskLanes for image atomic has default value '0'.
6542 // We must be sure that atomic variants (especially packed) will not be
6554 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6593 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6604 if (MRI->getType(Dst1Reg) != S32)
6621 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6629 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6677 LLT ResTy = MRI->getType(ResultRegs[0]);
6679 padWithUndef(ResTy, NumElts - ResultRegs.size());
6701 if (MRI->getType(DstReg).getNumElements() <
6702 MRI->getType(NewResultReg).getNumElements()) {
6710 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6722 LLT Ty = B.getMRI()->getType(OrigDst);
6730 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6732 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6816 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817 MF->push_back(TrapBB);
6818 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6840 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6873 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6894 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6909 // Is non-HSA path or trap-handler disabled? Then, report a warning
6919 // Insert debug-trap instruction
6981 assert(Opcode != -1);
7105 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7194 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7199 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7213 Br->getOperand(0).setMBB(CondBrTarget);
7221 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7222 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7224 BrCond->eraseFromParent();
7239 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7245 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7251 Br->getOperand(0).setMBB(CondBrTarget);
7256 BrCond->eraseFromParent();
7257 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());