Lines Matching +full:lo +full:- +full:x2 +full:- +full:en

1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
49 #define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
59 "amdgpu-use-divergent-register-indexing",
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
183 computeRegisterProperties(Subtarget->getRegisterInfo());
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
350 // Most operations are naturally 32-bit vector operations. We only support
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
469 if (Subtarget->hasSMemRealTime() ||
470 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
474 if (Subtarget->has16BitInsts()) {
481 if (Subtarget->hasMadMacF32Insts())
484 if (!Subtarget->hasBFI())
488 if (!Subtarget->hasBCNT(32))
491 if (!Subtarget->hasBCNT(64))
494 if (Subtarget->hasFFBH())
497 if (Subtarget->hasFFBL())
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
508 if (Subtarget->hasBFE())
512 if (Subtarget->hasIntClamp())
515 if (Subtarget->hasAddNoCarry())
528 if (Subtarget->haveRoundOpsF64())
551 if (Subtarget->has16BitInsts()) {
582 // F16 - Constant Actions.
586 // F16 - Load/Store Actions.
592 // BF16 - Load/Store Actions.
598 // F16 - VOP1 Actions.
606 // F16 - VOP2 Actions.
613 // F16 - VOP3 Actions.
650 // XXX - Do these do anything? Vector constants turn into build_vector.
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
773 if (Subtarget->hasVOP3PInsts()) {
811 if (Subtarget->hasPackedFP32Ops()) {
822 if (Subtarget->has16BitInsts()) {
843 if (Subtarget->hasScalarSMulU64())
846 if (Subtarget->hasMad64_32())
849 if (Subtarget->hasPrefetch())
852 if (Subtarget->hasIEEEMinMax()) {
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
955 // FIXME: In other contexts we pretend this is a per-function property.
970 //===----------------------------------------------------------------------===//
972 //===----------------------------------------------------------------------===//
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1014 if (Subtarget->has16BitInsts()) {
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will still be
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1122 LLVMContext &Ctx = Ty->getContext();
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1146 /// Map address space 7 to MVT::v5i32 because that's its in-memory
1147 /// representation. This return value is vector-typed because there is no
1151 /// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1161 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1163 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1194 if (RsrcIntr->IsImage) {
1197 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1201 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1202 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1203 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1212 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1213 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1217 if (RsrcIntr->IsImage) {
1220 if (!BaseOpcode->Gather4) {
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1242 Type *DataTy = CI.getArgOperand(0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1262 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1266 // XXX - Should this be volatile without known ordering?
1268 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1275 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1303 if (!Vol->isZero())
1311 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1326 if (!Vol->isZero())
1396 Info.ptrVal = MFI->getGWSPSV(TM);
1411 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1424 Info.ptrVal = MFI->getGWSPSV(TM);
1445 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1446 unsigned DstAS = I.getType()->getPointerAddressSpace();
1460 switch (II->getIntrinsicID()) {
1482 Ptr = II->getArgOperand(0);
1485 Ptr = II->getArgOperand(1);
1490 AccessTy = II->getType();
1497 if (!Subtarget->hasFlatInstOffsets()) {
1509 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1514 if (Subtarget->hasFlatGlobalInsts())
1517 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1534 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1535 // additionally can do r + r + i with addr64. 32-bit has more addressing
1543 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1544 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1589 if (!Subtarget->hasScalarSubwordLoads()) {
1594 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1598 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1599 // SMRD instructions have an 8-bit, dword offset on SI.
1602 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1603 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1604 // in 8-bits, it can use a smaller encoding.
1607 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1608 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1611 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1612 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1617 // On GFX12, all offsets are signed 24-bit in bytes.
1625 // Scalar (non-buffer) loads can only use a negative offset if
1626 // soffset+offset is non-negative. Since the compiler can only prove that
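For reference, a rough standalone restatement of the per-generation scalar (SMRD/SMEM) immediate-offset ranges described in the comments above. The enum and helper name are illustrative stand-ins, not backend APIs; the real checks live in the isLegalAddressingMode path and AMDGPUBaseInfo.

// Sketch only: approximate legal scalar-load immediate offsets per generation.
#include "llvm/Support/MathExtras.h"
using namespace llvm;

enum class Gen { SI, CI, VI, GFX9, GFX12 }; // hypothetical stand-in

static bool isLegalSMRDImmOffsetSketch(Gen G, int64_t ByteOffset) {
  switch (G) {
  case Gen::SI:    return isUInt<8>(ByteOffset / 4);   // 8-bit offset, counted in dwords
  case Gen::CI:    return isUInt<32>(ByteOffset / 4);  // 32-bit literal, counted in dwords
  case Gen::VI:    return isUInt<20>(ByteOffset);      // SMEM format, 20-bit byte offset
  case Gen::GFX9:  return isInt<21>(ByteOffset);       // signed 21-bit bytes (negative
                                                       // offsets further restricted, see above)
  case Gen::GFX12: return isInt<24>(ByteOffset);       // signed 24-bit byte offset
  }
  return false;
}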
1642 return Subtarget->enableFlatScratch()
1647 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1648 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1650 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1651 // an 8-bit dword offset but we don't know the alignment here.
1682 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1700 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1704 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1717 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1718 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1720 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1723 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1728 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1737 // operates with a speed comparable to N-bit wide load". With the full
1752 if (!Subtarget->hasDS96AndDS128())
1755 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1775 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1778 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1783 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1807 // Note that we have a single-dword or sub-dword here, so if underaligned
1813 Subtarget->hasUnalignedDSAccessEnabled();
1822 Subtarget->enableFlatScratch() ||
1823 Subtarget->hasUnalignedScratchAccess();
1830 !Subtarget->hasUnalignedScratchAccess()) {
1839 // than multiple smaller memory ops -- even when misaligned
1845 Subtarget->hasUnalignedBufferAccessEnabled();
1852 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1853 // byte-address are ignored, thus forcing Dword alignment.
1873 // use. Make sure we switch these to 64-bit accesses.
1888 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1898 // Flat -> private/local is a simple truncate.
1899 // Flat -> global is a no-op.
1911 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1938 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1943 // These operations are done with 32-bit instructions anyway.
1977 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1986 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2024 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2026 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2052 int64_t OffsetDiff = Offset - AlignDownOffset;
2151 if (Subtarget->hasArchitectedSGPRs() &&
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2205 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() ||
2214 Arg->VT.getScalarSizeInBits() == 16) &&
2224 Skipped.set(Arg->getOrigArgIndex());
2229 Info->markPSInputAllocated(PSInputNum);
2230 if (Arg->Used)
2231 Info->markPSInputEnabled(PSInputNum);
2253 unsigned Mask = (Subtarget->hasPackedTID() &&
2260 if (Subtarget->hasPackedTID()) {
2274 if (Subtarget->hasPackedTID()) {
2318 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2421 // flat_scratch_init is not applicable for non-kernel functions.
2484 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2500 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2517 // Don't preload non-original args or parts not in the current preload
2537 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2541 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2556 if (PreloadRegs->size() > 1)
2580 // Allocate special input registers that are initialized per-wave.
2586 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2587 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2588 // Note: user SGPRs are handled by the front-end for graphics shaders
2657 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2671 // Record that we know we have non-spill stack objects so we don't need to
2712 // whereas non-entry functions get this "for free". This means there is no
2746 if (ST.getFrameLowering()->hasFP(MF)) {
2752 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2753 return !Info->isEntryFunction();
2763 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2765 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2769 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2770 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2771 MachineBasicBlock::iterator MBBI = Entry->begin();
2781 Register NewVR = MRI->createVirtualRegister(RC);
2783 Entry->addLiveIn(*I);
2784 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2787 // Insert the copy-back instructions right before the terminator.
2789 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2790 TII->get(TargetOpcode::COPY), *I)
2799 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2806 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2808 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2809 DAG.getContext()->diagnose(NoGraphicsHSA);
2824 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2826 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2827 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2828 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2830 if (!Subtarget->enableFlatScratch())
2834 !Subtarget->hasArchitectedSGPRs())
2835 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2836 !Info->hasWorkGroupIDZ());
2847 // based on run-time states. Since we can't know what the final PSInputEna
2852 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2853 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2855 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2856 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2859 Info->markPSInputAllocated(0);
2860 Info->markPSInputEnabled(0);
2862 if (Subtarget->isAmdPalOS()) {
2864 // based on run-time states; the register values being generated here are
2871 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2874 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
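A standalone restatement of the PSInputAddr rule spelled out above, using the bit positions from the comment (PERSP_* in 0xF, LINEAR_* in 0x70, POS_W_FLOAT at input 11). The helper is hypothetical; it mirrors the check made on getPSInputAddr()/isPSInputAllocated(11).

// Sketch: the PS input configuration needs fixing up when no PERSP_*/LINEAR_*
// input is enabled, or when POS_W_FLOAT (input 11) is used without any PERSP_*.
static bool psInputNeedsFixupSketch(unsigned PSInputAddr, bool PosWAllocated) {
  bool AnyPerspOrLinear = (PSInputAddr & 0x7F) != 0; // PERSP_* (0xF) or LINEAR_* (0x70)
  bool AnyPersp = (PSInputAddr & 0xF) != 0;          // PERSP_* only
  return !AnyPerspOrLinear || (PosWAllocated && !AnyPersp);
}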
2877 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2888 if (IsKernel && Subtarget->hasKernargPreload())
2897 if (!Subtarget->enableFlatScratch())
2898 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2913 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2950 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2954 int64_t OffsetDiff = Offset - AlignDownOffset;
2961 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2990 TRI->getRegSizeInBits(*RC)));
3009 // If the argument was preloaded to multiple consecutive 32-bit
3013 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3034 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3035 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3036 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3037 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3039 // less than 16-bits. On CI and newer they could potentially be
3078 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3083 // If this is an 8 or 16-bit value, it is really passed promoted
3119 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3120 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3124 Info->setBytesInStackArgArea(StackArgSize);
3149 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3174 Info->setIfReturnsVoid(Outs.empty());
3175 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3177 // CCValAssign - represents the assignment of the return value to a location.
3181 // CCState - Info about the registers and stack slots.
3226 if (!Info->isEntryFunction()) {
3227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3229 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3326 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3331 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3336 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3346 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3347 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3348 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3349 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3350 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3351 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3353 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3364 if (CLI.CB->hasFnAttr(Attr.second))
3368 CalleeArgInfo->getPreloadedValue(InputID);
3380 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3403 if (OutgoingArg->isRegister()) {
3404 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3405 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3423 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3426 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3429 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3443 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3444 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3445 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3448 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3450 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3457 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3458 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3466 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3467 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3493 if (OutgoingArg->isRegister()) {
3495 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3497 CCInfo.AllocateReg(OutgoingArg->getRegister());
3536 if (Callee->isDivergent())
3542 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3543 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3577 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3578 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3595 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3603 if (!CI->isTailCall())
3606 const Function *ParentFn = CI->getParent()->getParent();
3607 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3628 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3635 if (RequestedExec.Ty->isIntegerTy(64)) {
3682 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3727 // arguments to begin at SP+0. Completely unused for non-tail calls.
3737 if (!Subtarget->enableFlatScratch()) {
3742 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3805 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3823 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3828 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3853 // Build a sequence of copy-to-reg nodes chained together with token chain
3863 // We don't usually want to end the call-sequence here because we would tidy
3864 // the frame up *after* the call; however, in the ABI-changing tail-call case
3878 const GlobalValue *GV = GSD->getGlobal();
3901 // Add a register mask operand representing the call-preserved registers.
3902 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3903 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3971 Register SPReg = Info->getStackPtrOffsetReg();
3980 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3981 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3983 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3988 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3990 Align StackAlign = TFL->getStackAlign();
3994 DAG.getConstant(-(uint64_t)Alignment->value()
3995 << Subtarget->getWavefrontSizeLog2(),
4007 // We only handle constant sizes here to allow non-entry block, static sized
4026 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4047 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4061 // [1:0] Single-precision round mode.
4062 // [3:2] Double/Half-precision round mode.
4064 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4067 // Toward-0: hardware encoding 3, FLT_ROUNDS spec value 0
4070 // -Inf:     hardware encoding 2, FLT_ROUNDS spec value 3
4073 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4085 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4095 // There's a gap between the 4-bit encoded table and the actual enum values, so offset
4114 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4118 static_cast<uint32_t>(ConstMode->getZExtValue()),
4124 // the range 0-3, we can use a simplified mapping to hardware values.
4127 // The supported standard values are 0-3. The extended values start at 8. We
4131 // Truncate to the low 32-bits.
4145 // table_index = umin(value, value - 4)
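A minimal standalone illustration of the two tricks described above: the 64-bit constant holding sixteen 4-bit table entries, and the umin(value, value - 4) folding of the FLT_ROUNDS inputs. The constants and helper names are illustrative, not the values the backend actually emits.

// Sketch: select one 4-bit entry from a packed 64-bit table by shifting by 4*index.
#include <cstdint>

static uint32_t lookup4BitEntry(uint64_t PackedTable, uint32_t Index) {
  return static_cast<uint32_t>(PackedTable >> (Index * 4)) & 0xF;
}

// Sketch of the index folding for SET_ROUNDING: the supported standard values 0-3
// stay in place, while the extended values starting at 8 collapse onto 4-7.
static uint32_t roundingTableIndex(uint32_t Value) {
  uint32_t Wrapped = Value - 4u;             // wraps for Value < 4 (unsigned)
  return Value < Wrapped ? Value : Wrapped;  // umin(value, value - 4)
}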
4186 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4193 if (Op->isDivergent())
4196 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4313 if (!Subtarget->hasFlatScrRegister() &&
4314 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4346 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4347 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4348 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4364 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4365 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4369 MF->insert(MBBI, LoopBB);
4370 MF->insert(MBBI, RemainderBB);
4372 LoopBB->addSuccessor(LoopBB);
4373 LoopBB->addSuccessor(RemainderBB);
4376 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4382 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4385 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4387 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4398 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4402 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4414 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4418 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4421 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4422 Src->setIsKill(false);
4426 MachineBasicBlock::iterator I = LoopBB->end();
4432 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4441 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4445 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4448 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4472 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4478 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4484 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4490 // Read the next variant <- also loop target.
4491 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4495 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4500 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4512 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4519 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4522 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4531 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4536 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4540 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4543 return InsertPt->getIterator();
4546 // This has slightly sub-optimal regalloc when the source vector is killed by
4548 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4556 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4558 MachineRegisterInfo &MRI = MF->getRegInfo();
4562 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4569 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4572 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4579 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4585 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4588 MF->insert(MBBI, LandingPad);
4589 LoopBB->removeSuccessor(RemainderBB);
4590 LandingPad->addSuccessor(RemainderBB);
4591 LoopBB->addSuccessor(LandingPad);
4592 MachineBasicBlock::iterator First = LandingPad->begin();
4593 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4622 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4624 assert(Idx->getReg() != AMDGPU::NoRegister);
4627 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4629 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4642 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4645 return Idx->getReg();
4648 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4658 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4660 MachineRegisterInfo &MRI = MF->getRegInfo();
4663 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4664 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4665 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4668 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4677 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4688 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4696 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4713 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4719 MachineBasicBlock *LoopBB = InsPt->getParent();
4723 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4730 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4744 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4746 MachineRegisterInfo &MRI = MF->getRegInfo();
4749 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4750 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4751 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4752 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4753 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4754 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4757 assert(Val->getReg());
4761 SrcVec->getReg(),
4765 if (Idx->getReg() == AMDGPU::NoRegister) {
4771 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4781 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4789 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4791 .addReg(SrcVec->getReg())
4798 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4801 .addReg(SrcVec->getReg())
4810 if (Val->isReg())
4811 MRI.clearKillFlags(Val->getReg());
4818 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4820 MachineBasicBlock *LoopBB = InsPt->getParent();
4824 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4832 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4848 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4855 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4861 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4881 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4902 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4903 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4905 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4908 I = ComputeLoop->end();
4910 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4914 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4915 .addReg(TmpSReg->getOperand(0).getReg())
4920 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4921 .addReg(ActiveBits->getOperand(0).getReg());
4923 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4925 .addReg(FF1->getOperand(0).getReg());
4926 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4927 .addReg(Accumulator->getOperand(0).getReg())
4928 .addReg(LaneValue->getOperand(0).getReg());
4934 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4935 .addReg(FF1->getOperand(0).getReg())
4936 .addReg(ActiveBits->getOperand(0).getReg());
4939 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4941 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4946 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4947 .addReg(NewActiveBits->getOperand(0).getReg())
4949 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4961 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4962 MachineFunction *MF = BB->getParent();
4963 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4981 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4983 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4992 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4994 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4995 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5001 if (Subtarget->hasScalarAddSub64()) {
5003 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5008 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5015 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5018 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5020 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5025 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5028 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5031 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
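The plain arithmetic behind that 32-bit split, as a standalone sketch rather than the MIR expansion itself: the low halves are added first and produce a carry (SCC in the scalar sequence), which the add of the high halves then consumes before the two halves are reassembled.

// Sketch: 64-bit add via two 32-bit adds with an explicit carry.
#include <cstdint>

static uint64_t add64Via32(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
  uint32_t Carry = Lo < static_cast<uint32_t>(A) ? 1u : 0u; // models the SCC carry-out
  uint32_t Hi = static_cast<uint32_t>(A >> 32) + static_cast<uint32_t>(B >> 32) + Carry;
  return (static_cast<uint64_t>(Hi) << 32) | Lo;            // reassemble, like REG_SEQUENCE
}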
5042 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5054 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5059 TII->legalizeOperands(*Add);
5064 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5080 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5082 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5084 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5086 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5089 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5091 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5095 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5103 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5110 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5115 TII->legalizeOperands(*LoHalf);
5116 TII->legalizeOperands(*HiHalf);
5125 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5138 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5140 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5144 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5146 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5151 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5152 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5158 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5163 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5168 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5169 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5171 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5184 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5189 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5194 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5195 .addImm(-1)
5203 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5212 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5214 .addImm(MFI->getLDSSize());
5219 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5220 MachineRegisterInfo &MRI = MF->getRegInfo();
5234 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5237 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5246 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5284 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5285 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5296 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5307 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5309 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5311 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5313 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5316 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5318 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5321 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5336 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5345 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5347 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5349 Br->getOperand(1).setIsUndef(); // read undef SCC
5355 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5357 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5358 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5362 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5365 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5368 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5385 if (TII->pseudoToMCOpcode(Opc) == -1) {
5390 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5391 if (TII->isVOP3(*I)) {
5392 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5394 I.addReg(TRI->getVCC(), RegState::Define);
5401 TII->legalizeOperands(*I);
5411 TII->legalizeOperands(MI);
5416 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5422 if (getSubtarget()->hasGWSAutoReplay()) {
5446 if (getSubtarget()->hasDenormModeInst()) {
5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5467 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5468 unsigned ImmVal = Def->getOperand(1).getImm();
5470 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5478 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5491 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5499 MI.setDesc(TII->get(AMDGPU::COPY));
5503 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5504 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5513 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5514 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5515 MF->push_back(TrapBB);
5516 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5518 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5521 BB->addSuccessor(TrapBB);
5526 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5527 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5529 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5598 if (!Subtarget->hasMadMacF32Insts())
5599 return Subtarget->hasFastFMAF32();
5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5644 return Subtarget->hasMadMacF32Insts() &&
5654 EVT VT = N->getValueType(0);
5656 return Subtarget->hasMadMacF32Insts() &&
5659 return Subtarget->hasMadF16() &&
5666 //===----------------------------------------------------------------------===//
5668 //===----------------------------------------------------------------------===//
5681 SDValue Lo, Hi;
5682 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5685 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5686 Op->getFlags());
5688 Op->getFlags());
5712 Op->getFlags());
5714 Op->getFlags());
5744 Op->getFlags());
5746 Op->getFlags());
5760 Result.getNode()->getNumValues() == 2) &&
5808 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5820 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5942 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5943 EVT LoadVT = M->getValueType(0);
5964 VTList, Ops, M->getMemoryVT(),
5965 M->getMemOperand());
5976 EVT LoadVT = M->getValueType(0);
5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5983 bool IsTFE = M->getNumValues() == 3;
5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6000 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6001 M->getMemOperand(), DAG);
6007 M->getMemOperand(), DAG);
6015 EVT VT = N->getValueType(0);
6016 unsigned CondCode = N->getConstantOperandVal(3);
6022 SDValue LHS = N->getOperand(1);
6023 SDValue RHS = N->getOperand(2);
6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6049 EVT VT = N->getValueType(0);
6051 unsigned CondCode = N->getConstantOperandVal(3);
6055 SDValue Src0 = N->getOperand(1);
6056 SDValue Src1 = N->getOperand(2);
6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6078 EVT VT = N->getValueType(0);
6079 SDValue Src = N->getOperand(1);
6083 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6088 // (ballot 0) -> 0
6089 if (Arg->isZero())
6092 // (ballot 1) -> EXEC/EXEC_LO
6093 if (Arg->isOne()) {
6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6115 EVT VT = N->getValueType(0);
6117 unsigned IID = N->getConstantOperandVal(0);
6124 SDValue Src2, MVT ValT) -> SDValue {
6129 Operands.push_back(N->getOperand(6));
6130 Operands.push_back(N->getOperand(5));
6131 Operands.push_back(N->getOperand(4));
6150 if (SDNode *GL = N->getGluedNode()) {
6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6152 GL = GL->getOperand(0).getNode();
6160 SDValue Src0 = N->getOperand(1);
6164 Src1 = N->getOperand(2);
6166 Src2 = N->getOperand(3);
6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6198 EVT VT = N->getValueType(0);
6202 unsigned NumOperands = N->getNumOperands();
6204 SDNode *GL = N->getGluedNode();
6207 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6212 SDValue Operand = N->getOperand(j);
6226 Operands[NumOperands - 1] =
6228 SDValue(GL->getOperand(0).getNode(), 0));
6230 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6294 switch (N->getOpcode()) {
6306 unsigned IID = N->getConstantOperandVal(0);
6312 SDValue Src0 = N->getOperand(1);
6313 SDValue Src1 = N->getOperand(2);
6324 SDValue Src0 = N->getOperand(1);
6325 SDValue Src1 = N->getOperand(2);
6338 EVT VT = N->getValueType(0);
6353 if (!Subtarget->hasScalarSubwordLoads())
6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6372 if (!Offset->isDivergent()) {
6417 EVT VT = N->getValueType(0);
6419 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6420 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6430 N->getOperand(0), LHS, RHS);
6438 if (N->getValueType(0) != MVT::v2f16)
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6451 if (N->getValueType(0) != MVT::v2f16)
6455 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6464 if (N->getValueType(0) != MVT::f16)
6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6485 if (I->getOpcode() == Opcode)
6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6493 switch (Intr->getConstantOperandVal(1)) {
6514 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6515 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6525 return (GV->getValueType()->isFunctionTy() ||
6526 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6535 if (!GV->hasExternalLinkage())
6553 if (Intr->getOpcode() == ISD::SETCC) {
6556 Intr = SetCC->getOperand(0).getNode();
6562 Target = BR->getOperand(1);
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6575 (SetCC->getConstantOperandVal(1) == 1 &&
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6604 BR->getOperand(0),
6607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6621 CopyToReg->getOperand(1),
6622 SDValue(Result, i - 1),
6625 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6630 SDValue(Intr, Intr->getNumValues() - 1),
6631 Intr->getOperand(0));
6647 if (Info->isEntryFunction())
6654 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6655 // Get the return address reg and mark it as an implicit live-in
6656 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6673 "Do not know how to custom lower FP_ROUND for non-f16 type");
6696 bool IsIEEEMode = Info->getMode().IEEE;
6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6759 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6763 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6767 // operands are zero-extended/sign-extended from 32-bits, then we split the
6768 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6769 // possible to check if the operands are zero-extended or sign-extended in
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6780 if (Op->isDivergent())
6785 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6786 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6787 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
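The decomposition behind that splitting, shown as a standalone sketch rather than the DAG/MIR lowering: modulo 2^64, only the low product and the low 32 bits of the cross terms matter.

// Sketch: 64-bit multiply assembled from 32-bit pieces. When both operands are
// known zero-extended from 32 bits the cross terms vanish, and the sign-extended
// case likewise reduces to a single signed 32x32->64 multiply, which is what the
// s_mul_u64_u32 / s_mul_i64_i32 pseudos mentioned above rely on.
#include <cstdint>

static uint64_t mul64Via32(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);
  uint64_t LoProduct = static_cast<uint64_t>(ALo) * BLo;  // full 32x32->64 product
  uint32_t Cross = ALo * BHi + AHi * BLo;                 // only low 32 bits contribute
  return LoProduct + (static_cast<uint64_t>(Cross) << 32);
}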
6813 const APInt &C = RHSC->getAPIntValue();
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6834 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
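A standalone restatement of the mulo rule quoted above for the unsigned case (the signed form uses an arithmetic shift right instead); the helper is illustrative only.

// Sketch of mulo(X, 1 << S): the product is X << S, and overflow occurred exactly
// when shifting back fails to recover X. Assumes S is less than the bit width.
#include <cstdint>

static bool umulPow2Overflow(uint64_t X, unsigned S, uint64_t &Product) {
  Product = X << S;            // X * (1 << S), modulo 2^64
  return (Product >> S) != X;  // bits were lost => overflow
}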
6842 if (Op->isDivergent()) {
6846 if (Subtarget->hasSMulHi()) {
6857 if (!Subtarget->isTrapHandlerEnabled() ||
6858 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6897 Register UserSGPR = Info->getQueuePtrUserSGPR();
6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6931 if (Subtarget->hasPrivEnabledTrap2NopBug())
6947 if (!Subtarget->isTrapHandlerEnabled() ||
6948 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6968 if (Subtarget->hasApertureRegs()) {
6972 // Note: this feature (register) is broken. When used as a 32-bit operand,
7008 Register UserSGPR = Info->getQueuePtrUserSGPR();
7011 // amdgpu-no-queue-ptr. This is undefined.
7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7062 SrcAS = ASC->getSrcAddressSpace();
7063 Src = ASC->getOperand(0);
7064 DestAS = ASC->getDestAddressSpace();
7069 Src = Op->getOperand(1);
7070 SrcAS = Op->getConstantOperandVal(2);
7071 DestAS = Op->getConstantOperandVal(3);
7077 // flat -> local/private
7095 // local/private -> flat
7123 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7132 // global <-> flat are no-ops and never emitted.
7137 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7139 return DAG.getUNDEF(Op->getValueType(0));
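A rough model of the address-space casts described above, ignoring the null-pointer checks and the invalid-cast diagnostics the real lowering adds. It assumes the aperture value is the 32-bit high half of the segment's flat base, as supplied via the aperture registers or the queue pointer.

// Sketch: flat -> local/private truncates to 32 bits; local/private -> flat pairs
// the 32-bit pointer with the aperture as the high half (a build_pair in the DAG).
#include <cstdint>

static uint32_t flatToSegmentSketch(uint64_t FlatPtr) {
  return static_cast<uint32_t>(FlatPtr);
}

static uint64_t segmentToFlatSketch(uint32_t SegPtr, uint32_t ApertureHi) {
  return (static_cast<uint64_t>(ApertureHi) << 32) | SegPtr;
}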
7156 unsigned IdxVal = Idx->getAsZExtVal();
7160 // Insert 32-bit registers one at a time.
7222 unsigned Idx = KIdx->getZExtValue();
7227 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7250 // Convert vector index to bit-index and get the required bit mask.
7292 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7297 SDValue Lo, Hi;
7303 Lo = DAG.getBitcast(LoVT,
7317 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7331 Lo = DAG.getBitcast(LoVT,
7342 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7344 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7365 // Convert vector index to bit-index (* EltSize)
7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7406 if (elementPairIsContiguous(SVN->getMask(), I)) {
7407 const int Idx = SVN->getMaskElt(I);
7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7411 PackVT, SVN->getOperand(VecIdx),
7415 const int Idx0 = SVN->getMaskElt(I);
7416 const int Idx1 = SVN->getMaskElt(I + 1);
7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7420 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7422 SDValue Vec0 = SVN->getOperand(VecIdx0);
7426 SDValue Vec1 = SVN->getOperand(VecIdx1);
7470 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7473 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7526 SDValue Lo = Op.getOperand(0);
7531 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7532 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7541 if (Lo.isUndef())
7544 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7545 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7547 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7555 // which can create arbitrary 64-bit addends. (This is only a problem for
7561 if (!Subtarget->isAmdHsaOS())
7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7568 !shouldEmitGOTReloc(GA->getGlobal());
7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7586 // constant, which is a pc-relative offset from the encoding of the $symbol
7591 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7595 // fixups or relocations are emitted to replace $symbol@*@lo and
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7615 const GlobalValue *GV = GSD->getGlobal();
7616 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7620 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7621 GV->hasExternalLinkage()) {
7622 Type *Ty = GV->getValueType();
7624 // zero-sized type in other languages to declare the dynamic shared
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7632 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7633 MFI->setUsesDynamicLDS(true);
7641 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7649 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7653 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7660 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7663 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7704 // The local size values will have the hi 16-bits as zero.
7712 "non-hsa intrinsic with hsa target",
7714 DAG.getContext()->diagnose(BadIntrin);
7723 DAG.getContext()->diagnose(BadIntrin);
7768 while (ExtraElts--)
7774 // Re-construct the required return value for an image load intrinsic.
7813 NumDataDwords - MaskPopDwords);
7843 if (Result->getNumValues() == 1)
7853 uint64_t Value = TexFailCtrlConst->getZExtValue();
7861 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7862 Value &= ~(uint64_t)0x2;
7877 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7883 I == DimIdx + NumGradients - 1))) {
7903 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7905 unsigned IntrOpcode = Intr->BaseOpcode;
7910 SmallVector<EVT, 3> ResultTypes(Op->values());
7911 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7926 if (BaseOpcode->Atomic) {
7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7934 if (BaseOpcode->AtomicX2) {
7949 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7952 if (BaseOpcode->Store) {
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7965 } else if (!BaseOpcode->NoReturn) {
7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7999 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8004 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8014 // occupies a full 32 bits.
8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8036 if (!ST->hasA16()) {
8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8051 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8060 ArgOffset + Intr->GradientStart,
8061 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8063 for (unsigned I = ArgOffset + Intr->GradientStart;
8064 I < ArgOffset + Intr->CoordStart; I++)
8071 ArgOffset + Intr->CoordStart, VAddrEnd,
8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8080 // without introducing moves, then using the non-sequential address encoding
8086 // so force non-NSA for the common 2-address case as a heuristic.
8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8095 const bool UseNSA = ST->hasNSAEncoding() &&
8096 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8104 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8113 if (!BaseOpcode->Sampler) {
8117 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8124 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8131 // Expecting to get an error flag since TFC is on - and dmask is 0
8145 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8146 // This is a no-op load. This can be eliminated
8166 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8167 if (BaseOpcode->Atomic)
8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8174 if (BaseOpcode->Store || BaseOpcode->Atomic)
8177 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8184 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8185 if (BaseOpcode->Sampler)
8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8189 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8194 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8197 if (!Subtarget->hasGFX90AInsts()) {
8199 } else if (TFE->getAsZExtVal()) {
8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8205 Ops.push_back(DimInfo->DA ? True : False);
8206 if (BaseOpcode->HasD16)
8213 int Opcode = -1;
8229 if (Subtarget->hasGFX90AInsts()) {
8232 if (Opcode == -1)
8236 if (Opcode == -1 &&
8237 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8240 if (Opcode == -1)
8244 if (Opcode == -1)
8249 MachineMemOperand *MemRef = MemOp->getMemOperand();
8253 if (BaseOpcode->AtomicX2) {
8258 if (BaseOpcode->NoReturn)
8261 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8280 if (!Offset->isDivergent()) {
8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8296 !Subtarget->hasScalarDwordx3Loads()) {
8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8347 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8362 if (!Subtarget->hasArchitectedSGPRs())
8376 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8405 // TODO: Should this propagate fast-math-flags?
8409 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8416 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8420 DAG.getContext()->diagnose(BadIntrin);
8429 if (MFI->isEntryFunction())
8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8463 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8464 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8473 if (Subtarget->isAmdHsaOS())
8480 if (Subtarget->isAmdHsaOS())
8487 if (Subtarget->isAmdHsaOS())
8494 if (Subtarget->isAmdHsaOS())
8501 if (Subtarget->isAmdHsaOS())
8508 if (Subtarget->isAmdHsaOS())
8515 if (Subtarget->isAmdHsaOS())
8521 if (Subtarget->isAmdHsaOS())
8527 if (Subtarget->isAmdHsaOS())
8544 if (MFI->isEntryFunction())
8550 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8552 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8554 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8618 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8622 // There is a Pat that handles this variant, so return it as-is.
8685 Op->getOperand(1), Op->getOperand(2)), 0);
8694 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8717 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8718 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8720 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8806 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8807 M->getMemOperand());
8834 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8835 M->getMemOperand());
8847 SDValue Chain = M->getOperand(0);
8848 SDValue M0 = M->getOperand(2);
8849 SDValue Value = M->getOperand(3);
8850 unsigned IndexOperand = M->getConstantOperandVal(7);
8851 unsigned WaveRelease = M->getConstantOperandVal(8);
8852 unsigned WaveDone = M->getConstantOperandVal(9);
8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8881 Offset1 |= (CountDw - 1) << 6;
8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8895 M->getVTList(), Ops, M->getMemoryVT(),
8896 M->getMemOperand());
8973 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9000 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9126 Op->getVTList(), Ops, VT, M->getMemOperand());
9130 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9149 Op->getVTList(), Ops, VT, M->getMemOperand());
9153 SDValue NodePtr = M->getOperand(2);
9154 SDValue RayExtent = M->getOperand(3);
9155 SDValue RayOrigin = M->getOperand(4);
9156 SDValue RayDir = M->getOperand(5);
9157 SDValue RayInvDir = M->getOperand(6);
9158 SDValue TDescr = M->getOperand(7);
9165 if (!Subtarget->hasGFX10_AEncoding()) {
9178 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9179 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9199 assert(Opcode != -1);
9265 Ops.append(16 - Ops.size(), Undef);
9276 Ops.push_back(M->getChain());
9278 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9279 MachineMemOperand *MemRef = M->getMemOperand();
9293 M->getOperand(0), // Chain
9294 M->getOperand(2), // Ptr
9295 M->getOperand(3) // Value
9316 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9317 Ops, M->getMemOperand());
9320 SDValue Chain = Op->getOperand(0);
9326 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9327 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9341 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9389 if (!Subtarget->hasDwordx3LoadStores() &&
9416 if (Subtarget->hasUnpackedD16VMem()) {
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9491 if (!Subtarget->hasCompressedExport()) {
9495 DAG.getContext()->diagnose(BadIntrin);
9513 Op.getOperand(3), // en
9517 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9518 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9571 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9572 M->getMemoryVT(), M->getMemOperand());
9599 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9600 M->getMemoryVT(), M->getMemOperand());
9649 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9650 M->getMemoryVT(), M->getMemOperand());
9701 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9702 M->getMemoryVT(), M->getMemOperand());
9716 unsigned Size = Op->getConstantOperandVal(4);
9767 MachineMemOperand *LoadMMO = M->getMemOperand();
9770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9778 auto F = LoadMMO->getFlags() &
9782 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9786 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9788 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9795 unsigned Size = Op->getConstantOperandVal(4);
9819 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9823 if (LHS->isDivergent())
9826 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9835 if (!Addr->isDivergent()) {
9849 MachineMemOperand *LoadMMO = M->getMemOperand();
9850 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9851 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9857 auto F = LoadMMO->getFlags() &
9861 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9864 LoadMMO->getAAInfo());
9866 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9873 Op->getOperand(2), Chain), 0);
9877 SDValue Chain = Op->getOperand(0);
9879 SDValue BarOp = Op->getOperand(2);
9885 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9941 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9975 unsigned ImmOffset = C1->getZExtValue();
9985 ImmOffset -= Overflow;
10014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10017 uint32_t Imm = C->getZExtValue();
10019 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10030 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10032 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10040 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10066 SDValue Pointer = Op->getOperand(1);
10067 SDValue Stride = Op->getOperand(2);
10068 SDValue NumRecords = Op->getOperand(3);
10069 SDValue Flags = Op->getOperand(4);
10076 ConstStride = ConstNode->getZExtValue();
10148 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10149 M->getMemOperand());
10172 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10173 // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
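// Editor's note (annotation, not part of SIISelLowering.cpp): conceptually,
// the transform below turns a uniform, 4-byte-aligned sub-dword load such as
//   %v = load i16, ptr %p, align 4
// into a 32-bit load followed by the matching truncate / sign- or zero-extend,
//   %w = load i32, ptr %p, align 4 ; %v = trunc/[sz]ext %w
// so it can be selected as an SMEM dword load. The checks that follow reject
// the unaligned, divergent, and non-invariant-global cases where this widening
// would be unsafe or unprofitable.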
10176 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10180 unsigned AS = Ld->getAddressSpace();
10183 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10188 // pre-legalize.
10189 EVT MemVT = Ld->getMemoryVT();
10196 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10200 SDValue Ptr = Ld->getBasePtr();
10202 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10203 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10204 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10209 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10215 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10218 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10219 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10222 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10225 EVT VT = Ld->getValueType(0);
10230 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10231 // the appropriate extension from the 32-bit load.
10232 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10252 ISD::LoadExtType ExtType = Load->getExtensionType();
10253 EVT MemVT = Load->getMemoryVT();
10262 SDValue Chain = Load->getChain();
10263 SDValue BasePtr = Load->getBasePtr();
10264 MachineMemOperand *MMO = Load->getMemOperand();
10300 "Custom lowering for non-i32 vectors hasn't been implemented.");
10302 Align Alignment = Load->getAlign();
10303 unsigned AS = Load->getAddressSpace();
10304 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10314 !Subtarget->hasMultiDwordFlatScratchAddressing())
10315 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10322 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10324 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10328 // Non-uniform loads will be selected to MUBUF instructions, so they
10337 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10338 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10341 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10345 // Non-uniform loads will be selected to MUBUF instructions, so they
10357 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10367 switch (Subtarget->getMaxPrivateElementSize()) {
10382 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10391 auto Flags = Load->getMemOperand()->getFlags();
10393 Load->getAlign(), Flags, &Fast) &&
10402 MemVT, *Load->getMemOperand())) {
10431 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10438 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10450 const SDNodeFlags Flags = Op->getFlags();
10462 if (CLHS->isExactlyValue(1.0)) {
10470 // 1.0 / sqrt(x) -> rsq(x)
10472 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10474 // 1.0 / x -> rcp(x)
10479 if (CLHS->isExactlyValue(-1.0)) {
10480 // -1.0 / x -> rcp (fneg x)
10492 // x / y -> x * (1.0 / y)
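// Editor's note (annotation, not part of SIISelLowering.cpp): the reciprocal
// rewrites above are exact only in real arithmetic, which is why this path is
// gated on the fast-math flags checked in this function. Folding the negation
// into the operand, by contrast, is exact even in IEEE arithmetic:
static_assert(-1.0 / 8.0 == 1.0 / -8.0,
              "-1.0 / x == 1.0 / (-x), so rcp(fneg x) preserves the value "
              "that rcp approximates");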
10503 const SDNodeFlags Flags = Op->getFlags();
10527 if (GlueChain->getNumValues() <= 1) {
10531 assert(GlueChain->getNumValues() == 3);
10549 if (GlueChain->getNumValues() <= 1) {
10553 assert(GlueChain->getNumValues() == 3);
10590 SDNodeFlags Flags = Op->getFlags();
10600 const APFloat K1Val(0x1p-32f);
10627 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10628 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10641 SDNodeFlags Flags = Op->getFlags();
10669 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10679 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10697 if (Subtarget->hasDenormModeInst()) {
10740 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10805 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10857 EVT ResultExpVT = Op->getValueType(1);
10868 if (Subtarget->hasFractBug()) {
10886 EVT VT = Store->getMemoryVT();
10889 return DAG.getTruncStore(Store->getChain(), DL,
10890 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10891 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10895 Store->getValue().getValueType().getScalarType() == MVT::i32);
10897 unsigned AS = Store->getAddressSpace();
10898 if (Subtarget->hasLDSMisalignedBug() &&
10900 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10909 !Subtarget->hasMultiDwordFlatScratchAddressing())
10910 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10919 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10923 VT, *Store->getMemOperand()))
10929 switch (Subtarget->getMaxPrivateElementSize()) {
10938 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10946 auto Flags = Store->getMemOperand()->getFlags();
10948 Store->getAlign(), Flags, &Fast) &&
10965 assert(!Subtarget->has16BitInsts());
10966 SDNodeFlags Flags = Op->getFlags();
10980 SDNodeFlags Flags = Op->getFlags();
10991 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11009 DAG.getConstant(-1, DL, MVT::i32));
11054 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11075 // r0 = 0.5 - h0 * g0
11079 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11083 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11088 SDNodeFlags Flags = Op->getFlags();
11093 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11130 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11136 // with finite only or nsz because rsq(+/-0) = +/-inf
11143 // If x is +INF, +0, or -0, use its original value
11154 // Propagate fast-math flags so that the multiply we introduce can be folded
11156 auto Flags = Op->getFlags();
11160 if (Subtarget->hasTrigReducedRange()) {
11179 assert(AtomicNode->isCompareAndSwap());
11180 unsigned AS = AtomicNode->getAddressSpace();
11186 // Non-local address space requires custom lowering for atomic compare
11200 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11201 Ops, VT, AtomicNode->getMemOperand());
11204 //===----------------------------------------------------------------------===//
11206 //===----------------------------------------------------------------------===//
11210 EVT VT = N->getValueType(0);
11218 SDValue Src = N->getOperand(0);
11223 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11244 SDValue MagnitudeOp = N->getOperand(0);
11245 SDValue SignOp = N->getOperand(1);
11251 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11274 // fcopysign f64:x, f64:y ->
11282 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11286 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11287 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11291 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
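// Editor's note (annotation, not part of SIISelLowering.cpp): compile-time
// checks of the identities behind this combine, with arbitrary constants and
// with x and c1 sharing no set bits for the 'or' form:
static_assert(((0x1230u + 0x3u) << 4) == ((0x1230u << 4) + (0x3u << 4)),
              "shl distributes over add");
static_assert(((0x1230u | 0x3u) << 4) == ((0x1230u << 4) + (0x3u << 4)),
              "or behaves as add when the operands share no set bits");
static_assert((0x1230u + 0x3u) * 5u == (0x1230u * 5u) + (0x3u * 5u),
              "mul distributes over add");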
11306 SDValue N0 = N->getOperand(0);
11307 SDValue N1 = N->getOperand(1);
11312 N0->hasOneUse())
11325 if (N0->getOpcode() == ISD::OR &&
11331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11341 EVT VT = N->getValueType(0);
11347 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11349 N0->getFlags().hasNoUnsignedWrap()));
11358 switch (N->getOpcode()) {
11374 SDValue Ptr = N->getOperand(PtrIdx);
11378 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11379 N->getMemoryVT(), DCI);
11381 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11397 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11398 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11399 // integer combine opportunities since most 64-bit operations are decomposed
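// Editor's note (annotation, not part of SIISelLowering.cpp): the split relies
// on a bitwise op acting independently on each 32-bit half of an i64, e.g.:
static_assert((0x1122334455667788ull & 0x00FF00FF0F0F0F0Full) ==
                  (((unsigned long long)(0x11223344u & 0x00FF00FFu) << 32) |
                   (0x55667788u & 0x0F0F0F0Fu)),
              "a 64-bit AND is two independent 32-bit ANDs");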
11407 uint64_t Val = CRHS->getZExtValue();
11410 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11414 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11415 // If we need to materialize a 64-bit immediate, it will be split up later
11416 // anyway. Avoid creating the harder to understand 64-bit immediate
11450 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11458 // or -1 if it did not succeed.
11460 // value 0-3 selects corresponding source byte;
11473 uint32_t C = N1->getZExtValue();
11510 EVT VT = N->getValueType(0);
11511 SDValue LHS = N->getOperand(0);
11512 SDValue RHS = N->getOperand(1);
11527 uint64_t Mask = CRHS->getZExtValue();
11529 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11531 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11532 unsigned Shift = CShift->getZExtValue();
11533 unsigned NB = CRHS->getAPIntValue().countr_zero();
11535 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11538 LHS->getOperand(0),
11551 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11566 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11569 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11570 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11584 if (!C1 || !C1->isInfinity() || C1->isNegative())
11612 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11613 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11614 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11621 Mask->getZExtValue() & ~OrdMask :
11622 Mask->getZExtValue() & OrdMask;
11640 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11643 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11655 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11664 // Each byte in each mask is either selector mask 0-3, or has higher
11714 // trunc* 255 srl -256
11718 // *In this example, the truncs are from i32->i16
11720 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11721 // respectively. calculateSrcByte would find (given node) -> ultimate src &
11722 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11744 switch (Op->getOpcode()) {
11746 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11752 SDValue NarrowOp = Op->getOperand(0);
11754 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11755 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11756 NarrowVT = VTSign->getVT();
11764 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11769 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11773 uint64_t BitShift = ShiftOp->getZExtValue();
11780 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11807 if (Index > BitWidth / 8 - 1)
11826 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11828 if (!LHS || LHS->isConstantZero())
11830 if (!RHS || RHS->isConstantZero())
11839 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11843 uint32_t BitMask = BitMaskOp->getZExtValue();
11855 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11862 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
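// Editor's note (annotation, not part of SIISelLowering.cpp): a concrete
// instance of the fshr formula above for BW = 32 and Z = 8; the result is the
// low 32 bits of the 64-bit concatenation X:Y shifted right by 8.
static_assert(((0xAABBCCDDu << 24) | (0x11223344u >> 8)) == 0xDD112233u,
              "fshr(X, Y, 8) == (X << 24) | (Y >> 8) for 32-bit X, Y");
static_assert((unsigned)(0xAABBCCDD11223344ull >> 8) == 0xDD112233u,
              "which equals the low word of (X:Y) >> 8");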
11863 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11871 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11890 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11894 uint64_t BitShift = ShiftOp->getZExtValue();
11904 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11908 return BytesProvided - ByteShift > Index
11909 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11918 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11922 uint64_t BitShift = ShiftOp->getZExtValue();
11930 // of interest is Index - ByteShift of the src
11933 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11945 SDValue NarrowOp = Op->getOperand(0);
11947 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11948 Op->getOpcode() == ISD::AssertZext ||
11949 Op->getOpcode() == ISD::AssertSext) {
11950 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11951 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11989 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11998 return L->getExtensionType() == ISD::ZEXTLOAD
12015 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12020 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12023 auto VecIdx = IdxOp->getZExtValue();
12035 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12040 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12072 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12075 auto MemVT = L->getMemoryVT();
12078 return L->getMemoryVT().getSizeInBits() == 16;
12093 bool IsConsecutive = (Hi8 - Low8 == 1);
12164 : NumElements - NormalizedTrunc;
12185 [[maybe_unused]] EVT VT = N->getValueType(0);
12195 if (!P || P->isConstantZero())
12217 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12218 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12223 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12248 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12251 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12261 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12264 // ANY_EXTEND as the extended bits are don't-cares.
12277 SDValue LHS = N->getOperand(0);
12278 SDValue RHS = N->getOperand(1);
12280 EVT VT = N->getValueType(0);
12282 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12297 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12306 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12310 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12320 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12321 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12323 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12328 // If we have any non-vectorized use, then it is a candidate for v_perm
12329 if (OrUse->getOpcode() != ISD::BITCAST ||
12330 !OrUse->getValueType(0).isVector())
12333 // If any use of the bitcast is non-vectorized, it is still a candidate for v_perm
12334 for (auto VUse : OrUse->uses()) {
12335 if (!VUse->getValueType(0).isVector())
12340 // TODO -- whitelist more uses
12342 if (VUse->getOpcode() == VectorwiseOp)
12348 if (!any_of(N->uses(), usesCombinedOperand))
12363 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12398 // (or i64:x, (zero_extend i32:y)) ->
12422 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12426 N->getOperand(0), CRHS))
12438 SDValue LHS = N->getOperand(0);
12439 SDValue RHS = N->getOperand(1);
12444 EVT VT = N->getValueType(0);
12451 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12452 // fneg-like xors into 64-bit select.
12455 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12457 // xor (select c, a, b), 0x80000000 ->
12461 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12463 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12467 LHS->getOperand(0), FNegLHS, FNegRHS);
12477 if (!Subtarget->has16BitInsts() ||
12481 EVT VT = N->getValueType(0);
12485 SDValue Src = N->getOperand(0);
12495 SDValue Src = N->getOperand(0);
12496 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12501 VTSign->getVT() == MVT::i8) ||
12503 VTSign->getVT() == MVT::i16))) {
12504 assert(Subtarget->hasScalarSubwordLoads() &&
12520 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12525 VTSign->getVT() == MVT::i8) ||
12527 VTSign->getVT() == MVT::i16)) &&
12547 Ops, M->getMemoryVT(),
12548 M->getMemOperand());
12558 SDValue Mask = N->getOperand(1);
12560 // fp_class x, 0 -> false
12564 if (N->getOperand(0).isUndef())
12572 EVT VT = N->getValueType(0);
12573 SDValue N0 = N->getOperand(0);
12584 N->getFlags());
12589 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12591 N0.getOperand(0), N->getFlags());
12604 const auto &F = CFP->getValueAPF();
12667 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12676 if (RHS->getZExtValue() == 0xffff0000) {
12677 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12705 if (Subtarget->supportsMinMaxDenormModes() ||
12711 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12716 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12723 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12724 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12729 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12737 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12740 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12741 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12751 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12759 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12801 unsigned Opcode = MI->getOpcode();
12809 if (FCR->Value.isSignaling())
12811 if (!FCR->Value.isDenormal())
12814 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12852 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12859 if (Subtarget->supportsMinMaxDenormModes() ||
12867 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12868 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12873 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12939 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12957 SDValue N0 = N->getOperand(0);
12958 EVT VT = N->getValueType(0);
12960 // fcanonicalize undef -> qnan
12967 EVT VT = N->getValueType(0);
12968 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12971 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12974 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12982 SDValue Lo = N0.getOperand(0);
12984 EVT EltVT = Lo.getValueType();
12986 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
12991 CFP->getValueAPF());
13065 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13068 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13072 EVT VT = MinK->getValueType(0);
13074 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13080 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13089 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13109 if (K0->getValueAPF() > K1->getValueAPF())
13117 if (Info->getMode().DX10Clamp) {
13120 // FIXME: Should this be allowing -0.0?
13121 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13135 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13137 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13138 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13139 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13178 EVT VT = N->getValueType(0);
13179 unsigned Opc = N->getOpcode();
13180 SDValue Op0 = N->getOperand(0);
13181 SDValue Op1 = N->getOperand(1);
13187 // max(max(a, b), c) -> max3(a, b, c)
13188 // min(min(a, b), c) -> min3(a, b, c)
13193 N->getValueType(0),
13200 // max(a, max(b, c)) -> max3(a, b, c)
13201 // min(a, min(b, c)) -> min3(a, b, c)
13206 N->getValueType(0),
13213 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13214 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
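// Editor's note (annotation, not part of SIISelLowering.cpp): med3 returns the
// median of its three operands, so the usual clamp idiom collapses into it.
// Minimal check with illustrative helpers and K0 = 0, K1 = 15:
constexpr int clampRef(int X) { return X < 0 ? 0 : (X > 15 ? 15 : X); }
constexpr int med3Ref(int A, int B, int C) {
  return A > B ? (B > C ? B : (A > C ? C : A))
               : (A > C ? A : (B > C ? C : B));
}
static_assert(med3Ref(-7, 0, 15) == clampRef(-7) &&
                  med3Ref(4, 0, 15) == clampRef(4) &&
                  med3Ref(99, 0, 15) == clampRef(99),
              "min(max(x, K0), K1) == med3(x, K0, K1) when K0 < K1");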
13217 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13222 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13228 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13233 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13237 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13243 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13244 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13256 // FIXME: Should this be allowing -0.0?
13257 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13258 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13268 EVT VT = N->getValueType(0);
13275 SDValue Src0 = N->getOperand(0);
13276 SDValue Src1 = N->getOperand(1);
13277 SDValue Src2 = N->getOperand(2);
13280 // const_a, const_b, x -> clamp is safe in all cases including signaling
13282 // FIXME: Should this be allowing -0.0?
13290 // handling no dx10-clamp?
13291 if (Info->getMode().DX10Clamp) {
13312 SDValue Src0 = N->getOperand(0);
13313 SDValue Src1 = N->getOperand(1);
13315 return DCI.DAG.getUNDEF(N->getValueType(0));
13319 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13330 // Sub-dword vectors of size 2 dword or less have better implementation.
13334 // Always expand the rest of sub-dword instructions, otherwise it will be
13339 // Always do this if var-idx is divergent, otherwise it will become a loop.
13349 if (!Subtarget->hasMovrel())
13358 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13362 SDValue Vec = N->getOperand(0);
13369 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13374 SDValue Vec = N->getOperand(0);
13379 EVT ResVT = N->getValueType(0);
13387 SDValue Idx = N->getOperand(1);
13393 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13397 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13400 SDValue Idx = N->getOperand(1);
13428 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13433 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
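// Editor's note (annotation, not part of SIISelLowering.cpp): for a 4-element
// vector the expansion is a chain of selects against each constant index; an
// illustrative scalar equivalent:
constexpr int extractViaSelects(int E0, int E1, int E2, int E3, int Idx) {
  return Idx == 3 ? E3 : (Idx == 2 ? E2 : (Idx == 1 ? E1 : E0));
}
static_assert(extractViaSelects(10, 20, 30, 40, 2) == 30,
              "a variable-index extract becomes per-element selects");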
13436 SDValue Idx = N->getOperand(1);
13452 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13454 // multiple small extract_vector_elements with a single 32-bit extract.
13455 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13460 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13493 SDValue Vec = N->getOperand(0);
13494 SDValue Idx = N->getOperand(2);
13498 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13499 // => BUILD_VECTOR n x select (e, const-idx)
13505 SDValue Ins = N->getOperand(1);
13528 APFloat Val = CFP->getValueAPF();
13540 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13543 SDValue TruncSrc = N->getOperand(0);
13544 EVT VT = N->getValueType(0);
13586 EVT VT = N0->getValueType(0);
13592 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13599 (N0->getFlags().hasAllowContract() &&
13600 N1->getFlags().hasAllowContract())) &&
13609 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13612 EVT VT = N->getValueType(0);
13619 unsigned Opc = N->getOpcode();
13620 SDValue Op0 = N->getOperand(0);
13621 SDValue Op1 = N->getOperand(1);
13623 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13626 if (Op0->isDivergent())
13634 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13637 if (Op1->isDivergent())
13655 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13658 // Full 64-bit multiplies that feed into an addition are lowered here instead
13665 assert(N->getOpcode() == ISD::ADD);
13668 EVT VT = N->getValueType(0);
13670 SDValue LHS = N->getOperand(0);
13671 SDValue RHS = N->getOperand(1);
13678 if (!N->isDivergent() && Subtarget->hasSMulHi())
13691 // multiple uses, except on hardware with full-rate multiply-add (which is
13692 // part of full-rate 64-bit ops).
13693 if (!Subtarget->hasFullRate64Ops()) {
13695 for (SDNode *Use : LHS->uses()) {
13698 if (Use->getOpcode() != ISD::ADD)
13737 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13738 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13739 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13742 // are {sign,zero}-extended or not.
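// Editor's note (annotation, not part of SIISelLowering.cpp): the recipe above
// is a 64 x 64 -> 64 multiply-add done in 32-bit pieces; the lhs.hi * rhs.hi
// term and all carries out of bit 63 vanish modulo 2^64. An illustrative check:
constexpr unsigned long long mad64Via32(unsigned long long Lhs,
                                        unsigned long long Rhs,
                                        unsigned long long Accum) {
  // accum    = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi += mul32 lhs.hi, rhs.lo
  // accum.hi += mul32 lhs.lo, rhs.hi
  return (unsigned long long)(unsigned)Lhs * (unsigned)Rhs + Accum +
         ((unsigned long long)((unsigned)(Lhs >> 32) * (unsigned)Rhs +
                               (unsigned)Lhs * (unsigned)(Rhs >> 32))
          << 32);
}
static_assert(mad64Via32(0x123456789ABCDEF0ull, 0x0FEDCBA987654321ull, 42u) ==
                  0x123456789ABCDEF0ull * 0x0FEDCBA987654321ull + 42u,
              "64-bit mul+add reconstructed from 32-bit pieces");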
13786 if (!Byte0 || Byte0->isConstantZero()) {
13790 if (Byte1 && !Byte1->isConstantZero()) {
13837 unsigned FMask = 0xFF << (8 * (3 - Step));
13840 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13842 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13846 int FirstGroup = -1;
13856 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13861 if (FirstGroup != -1) {
13869 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13880 unsigned FMask = 0xFF << (8 * (3 - Step));
13884 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13888 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13901 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13904 if (Elt->PermMask == 0x3020100)
13908 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13919 auto FirstMask = FirstElt->PermMask;
13920 auto SecondMask = SecondElt->PermMask;
13930 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13932 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13947 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13951 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13964 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13981 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14043 EVT VT = N->getValueType(0);
14045 SDValue LHS = N->getOperand(0);
14046 SDValue RHS = N->getOperand(1);
14049 if (Subtarget->hasMad64_32()) {
14059 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14060 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14070 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14071 if (MulIdx == -1)
14073 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14076 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14081 TempNode->getOperand(MulIdx), *Src0, *Src1,
14082 TempNode->getOperand(MulIdx)->getOperand(0),
14083 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14091 auto AddIdx = 1 - MulIdx;
14092 // Allow the special case where add (add (mul24, 0), mul24) became ->
14094 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14095 Src2s.push_back(TempNode->getOperand(AddIdx));
14097 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14101 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14105 TempNode->getOperand(AddIdx), *Src0, *Src1,
14106 TempNode->getOperand(AddIdx)->getOperand(0),
14107 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14119 TempNode = TempNode->getOperand(AddIdx);
14122 if (TempNode->getNumOperands() < 2)
14124 LHS = TempNode->getOperand(0);
14125 RHS = TempNode->getOperand(1);
14146 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14147 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14148 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14150 auto Src0Mask = Src0s.begin()->PermMask;
14154 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14168 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14171 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14172 SecondElt->DWordOffset);
14188 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14232 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14241 EVT VT = N->getValueType(0);
14247 SDValue LHS = N->getOperand(0);
14248 SDValue RHS = N->getOperand(1);
14275 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14283 if (N->getValueType(0) != MVT::i32)
14286 if (!isNullConstant(N->getOperand(1)))
14290 SDValue LHS = N->getOperand(0);
14295 unsigned Opc = N->getOpcode();
14298 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14299 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14310 EVT VT = N->getValueType(0);
14313 SDValue LHS = N->getOperand(0);
14314 SDValue RHS = N->getOperand(1);
14319 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14331 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14353 EVT VT = N->getValueType(0);
14361 SDValue LHS = N->getOperand(0);
14362 SDValue RHS = N->getOperand(1);
14364 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14378 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14384 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14397 EVT VT = N->getValueType(0);
14398 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14401 SDValue LHS = N->getOperand(0);
14402 SDValue RHS = N->getOperand(1);
14404 SDNodeFlags Flags = N->getFlags();
14405 SDNodeFlags RHSFlags = RHS->getFlags();
14407 !RHS->hasOneUse())
14412 if (CLHS->isExactlyValue(1.0) ||
14413 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14414 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14415 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14431 EVT VT = N->getValueType(0);
14434 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14437 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14439 SDValue Op1 = N->getOperand(0);
14440 SDValue Op2 = N->getOperand(1);
14441 SDValue FMA = N->getOperand(2);
14450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14453 (N->getFlags().hasAllowContract() &&
14454 FMA->getFlags().hasAllowContract())) {
14508 SDValue LHS = N->getOperand(0);
14509 SDValue RHS = N->getOperand(1);
14511 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14525 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14526 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14527 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
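// Editor's note (annotation, not part of SIISelLowering.cpp): sext from i1
// only produces 0 or -1 (all ones), so comparing it against those constants
// reduces to cc or its inverse. For example, cc = 1 gives sext cc = -1, hence
// "(sext cc) eq -1" is cc itself, while "(sext cc) eq 0" and "(sext cc) ne -1"
// are both "not cc", which the folds below emit as xor cc, -1.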
14529 if ((CRHS->isAllOnes() &&
14531 (CRHS->isZero() &&
14534 DAG.getConstant(-1, SL, MVT::i1));
14535 if ((CRHS->isAllOnes() &&
14537 (CRHS->isZero() &&
14542 const APInt &CRHSVal = CRHS->getAPIntValue();
14550 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14552 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14560 DAG.getConstant(-1, SL, MVT::i1));
14568 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14572 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14573 // (fcmp one (fabs x), inf) -> (fp_class x,
14580 const APFloat &APF = CRHS->getValueAPF();
14603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14605 SDValue Src = N->getOperand(0);
14606 SDValue Shift = N->getOperand(0);
14613 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14614 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14615 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14616 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14617 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
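// Editor's note (annotation, not part of SIISelLowering.cpp): cvt_f32_ubyteN
// reads byte N of its 32-bit source, so a constant byte-sized shift simply
// renames which byte is read. Checking two of the rules with an illustrative
// helper:
constexpr unsigned byteN(unsigned X, unsigned N) {
  return (X >> (8 * N)) & 0xFFu;
}
static_assert(byteN(0xAABBCCDDu << 8, 1) == byteN(0xAABBCCDDu, 0),
              "cvt_f32_ubyte1 (shl x, 8) reads the same byte as cvt_f32_ubyte0 x");
static_assert(byteN(0xAABBCCDDu >> 16, 0) == byteN(0xAABBCCDDu, 2),
              "cvt_f32_ubyte0 (srl x, 16) reads the same byte as cvt_f32_ubyte2 x");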
14624 ShiftOffset -= C->getZExtValue();
14626 ShiftOffset += C->getZExtValue();
14640 if (N->getOpcode() != ISD::DELETED_NODE)
14648 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14655 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14660 const APFloat &F = CSrc->getValueAPF();
14663 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14664 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14669 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14679 switch (N->getOpcode()) {
14715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14716 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14717 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14741 SDValue Src = N->getOperand(0);
14764 EVT VT = N->getValueType(0);
14766 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14769 SDValue Src = N->getOperand(0);
14819 unsigned Opcode = Node->getMachineOpcode();
14822 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14823 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14828 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14829 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14831 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14832 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14833 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14834 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14838 bool HasChain = Node->getNumValues() > 1;
14852 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14860 if (!I->isMachineOpcode() ||
14861 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14868 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14899 // If the original dmask has one channel, then there is nothing to do
14902 // Use an arbitrary dmask - required for the instruction to work
14911 // Check for TFE or LWE - increase the number of channels by one to account
14918 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14919 assert(NewOpcode != -1 &&
14920 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14925 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14927 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14929 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14943 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14948 assert(Node->hasNUsesOfValue(1, 0));
14950 SDLoc(Node), Users[Lane]->getValueType(0),
14998 if (Node->getOpcode() == ISD::CopyToReg) {
14999 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15000 SDValue SrcVal = Node->getOperand(2);
15004 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15010 SDNode *Glued = Node->getGluedNode();
15012 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15013 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15024 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15025 if (!isFrameIndexOp(Node->getOperand(i))) {
15026 Ops.push_back(Node->getOperand(i));
15032 Node->getOperand(i).getValueType(),
15033 Node->getOperand(i)), 0));
15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15044 unsigned Opcode = Node->getMachineOpcode();
15046 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15047 !TII->isGather4(Opcode) &&
15064 SDValue Src0 = Node->getOperand(1);
15065 SDValue Src1 = Node->getOperand(3);
15066 SDValue Src2 = Node->getOperand(5);
15075 getRegClassFor(VT, Src0.getNode()->isDivergent());
15101 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15106 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15121 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15122 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15129 if (TII->isImage(MI)) {
15130 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15131 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15132 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15137 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15138 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15139 unsigned D16Val = D16 ? D16->getImm() : 0;
15144 // At least one of TFE or LWE is non-zero
15149 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15154 unsigned dmask = MO_Dmask->getImm();
15157 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15159 bool Packed = !Subtarget->hasUnpackedD16VMem();
15164 // - this is in fact an error but this is picked up elsewhere and
15167 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15170 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15171 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15185 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15186 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15189 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15190 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15193 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15195 // Insert into the super-reg
15196 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15208 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15217 MachineFunction *MF = MI.getParent()->getParent();
15218 MachineRegisterInfo &MRI = MF->getRegInfo();
15219 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15221 if (TII->isVOP3(MI.getOpcode())) {
15223 TII->legalizeOperandsVOP3(MRI, MI);
15226 // This saves a chain-copy of registers and better balances register
15230 bool HasAGPRs = Info->mayNeedAGPRs();
15231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15236 if (I == -1)
15243 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15244 if (!TRI->hasAGPRs(RC))
15247 if (!Src || !Src->isCopy() ||
15248 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15250 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15261 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15262 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15263 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15264 if (TRI->isVectorSuperClass(RC)) {
15265 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15266 MRI.setRegClass(Src2->getReg(), NewRC);
15267 if (Src2->isTied())
15277 if (TII->isImage(MI))
15278 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15293 // full 128-bit register. If we are building multiple resource descriptors,
15294 // this will allow CSEing of the 2-component register.
15299 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15352 //===----------------------------------------------------------------------===//
15354 //===----------------------------------------------------------------------===//
15390 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15397 if (!Subtarget->hasMAIInsts())
15404 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15419 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15437 uint32_t Width = (End - Idx + 1) * 32;
15438 MCRegister Reg = RC->getRegister(Idx);
15440 RC = TRI->getVGPRClassForBitWidth(Width);
15442 RC = TRI->getSGPRClassForBitWidth(Width);
15444 RC = TRI->getAGPRClassForBitWidth(Width);
15446 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15452 if (!Failed && Idx < RC->getNumRegs())
15453 return std::pair(RC->getRegister(Idx), RC);
15460 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15528 if (Size == 16 && !Subtarget->has16BitInsts())
15532 Val = C->getSExtValue();
15536 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15544 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15545 Val = C->getSExtValue();
15548 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15549 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15592 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15663 return -1;
15668 // the function is legalized do we know all of the non-spill stack objects or if
15674 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15677 if (Info->isEntryFunction()) {
15686 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15687 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15689 Info->setSGPRForEXECCopy(SReg);
15691 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15692 Info->getStackPtrOffsetReg()));
15693 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15694 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15698 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15699 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15701 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15702 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15704 Info->limitOccupancy(MF);
15709 TII->fixImplicitOperands(MI);
15716 // per-subtarget, but there's no easy way to achieve that right now. This is
15725 int NewClassID = getAlignedAGPRClassID(RC->getID());
15726 if (NewClassID != -1)
15727 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15749 // These return at most the (wavefront size - 1) + src1
15758 Known.Zero.setHighBits(Size - MaxActiveBits);
15776 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15790 switch (MI->getOpcode()) {
15793 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15805 // These return at most the wavefront size - 1.
15807 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15815 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15829 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15861 Intrinsic::ID IID = GI->getIntrinsicID();
15874 // Pre-GFX10 targets did not benefit from loop alignment
15875 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15876 getSubtarget()->hasInstFwdPrefetchBug())
15889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15890 const MachineBasicBlock *Header = ML->getHeader();
15891 if (Header->getAlignment() != PrefAlign)
15892 return Header->getAlignment(); // Already processed.
15895 for (const MachineBasicBlock *MBB : ML->blocks()) {
15899 LoopSize += MBB->getAlignment().value() / 2;
15902 LoopSize += TII->getInstSizeInBytes(MI);
15916 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15917 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15918 auto I = Exit->getFirstNonDebugInstr();
15919 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15924 MachineBasicBlock *Pre = ML->getLoopPreheader();
15925 MachineBasicBlock *Exit = ML->getExitBlock();
15928 auto PreTerm = Pre->getFirstTerminator();
15929 if (PreTerm == Pre->begin() ||
15930 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15931 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15934 auto ExitHead = Exit->getFirstNonDebugInstr();
15935 if (ExitHead == Exit->end() ||
15936 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15937 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15946 assert(N->getOpcode() == ISD::CopyFromReg);
15949 N = N->getOperand(0).getNode();
15950 if (N->getOpcode() == ISD::INLINEASM ||
15951 N->getOpcode() == ISD::INLINEASM_BR)
15953 } while (N->getOpcode() == ISD::CopyFromReg);
15960 switch (N->getOpcode()) {
15962 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15963 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15965 Register Reg = R->getReg();
15969 return !TRI->isSGPRReg(MRI, Reg);
15971 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15972 return UA->isDivergent(V);
15974 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15975 return !TRI->isSGPRReg(MRI, Reg);
15979 unsigned AS = L->getAddressSpace();
15986 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15988 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16007 // Target-specific read-modify-write atomics are sources of divergence.
16011 // Generic read-modify-write atomics are sources of divergence.
16012 return A->readMem() && A->writeMem();
16052 if (Info->getMode().DX10Clamp)
16066 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16074 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16084 LLVMContext &Ctx = RMW->getContext();
16087 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16089 : SSNs[RMW->getSyncScopeID()];
16093 << RMW->getOperationName(RMW->getOperation())
16099 Type *EltTy = VT->getElementType();
16100 return VT->getNumElements() == 2 &&
16101 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16109 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16114 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16119 unsigned AS = RMW->getPointerAddressSpace();
16124 OptimizationRemarkEmitter ORE(RMW->getFunction());
16131 auto SSID = RMW->getSyncScopeID();
16134 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16136 switch (RMW->getOperation()) {
16143 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16144 ConstVal && ConstVal->isNullValue())
16151 Type *Ty = RMW->getType();
16156 // is fixed to round-to-nearest-even.
16159 // round-to-nearest-even.
16162 // suggests it is OK if the floating-point mode may not match the calling
16164 if (Ty->isFloatTy()) {
16165 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16169 if (Ty->isDoubleTy()) {
16171 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16175 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16185 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16190 // FIXME: Needs to account for no fine-grained memory
16191 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16195 // FIXME: Needs to account for no fine-grained memory
16196 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16200 // FIXME: Needs to account for no fine-grained memory
16201 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16205 // FIXME: Needs to account for no fine-grained memory
16206 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16211 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16215 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16223 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16227 if (Ty->isFloatTy()) {
16228 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16229 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16232 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16236 if (RMW->use_empty() &&
16237 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16243 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244 if (Subtarget->hasFlatAtomicFaddF32Inst())
16252 if (Subtarget->hasLDSFPAtomicAddF32()) {
16253 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16255 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16264 Type *Ty = RMW->getType();
16267 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16270 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16288 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16290 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16294 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16296 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16323 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16330 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16337 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16347 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16349 if (!TRI->isSGPRClass(RC) && !isDivergent)
16350 return TRI->getEquivalentSGPRClass(RC);
16351 if (TRI->isSGPRClass(RC) && isDivergent)
16352 return TRI->getEquivalentVGPRClass(RC);
16362 // FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
16368 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16369 if (!IT || IT->getBitWidth() != WaveSize)
16377 for (const auto *U : V->users()) {
16379 if (V == U->getOperand(1)) {
16380 switch (Intrinsic->getIntrinsicID()) {
16391 if (V == U->getOperand(0)) {
16392 switch (Intrinsic->getIntrinsicID()) {
16414 if (CI->isInlineAsm()) {
16420 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16422 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16428 if (RC && SIRI->isSGPRClass(RC))
16435 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16439 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16454 if (N0->isDivergent() || !N1->isDivergent())
16459 hasMemSDNodeUser(*N0->use_begin()));
16481 if (User->getOpcode() != ISD::CopyToReg)
16483 if (!Def->isMachineOpcode())
16489 unsigned ResNo = User->getOperand(Op).getResNo();
16490 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16492 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16496 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16497 Cost = RC->getCopyCost();
16504 AtomicRMWInst::BinOp Op = AI->getOperation();
16508 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16509 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16511 AI->setOperation(AtomicRMWInst::Add);
16515 assert(Subtarget->hasAtomicFaddInsts() &&
16517 assert(AI->getType()->isFloatTy() &&
16518 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16569 Function *F = BB->getParent();
16571 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16582 Value *Val = AI->getValOperand();
16583 Type *ValTy = Val->getType();
16584 Value *Addr = AI->getPointerOperand();
16587 Value *Val) -> Value * {
16589 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16590 AI->getOrdering(), AI->getSyncScopeID());
16592 AI->getAllMetadata(MDs);
16594 OldVal->setMetadata(P.first, P.second);
16598 std::prev(BB->end())->eraseFromParent();
16635 Loaded->addIncoming(LoadedShared, SharedBB);
16636 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16637 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16640 AI->replaceAllUsesWith(Loaded);
16641 AI->eraseFromParent();
16647 auto Order = AI->getOrdering();
16657 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16658 LI->setAtomic(Order, AI->getSyncScopeID());
16659 LI->copyMetadata(*AI);
16660 LI->takeName(AI);
16661 AI->replaceAllUsesWith(LI);
16662 AI->eraseFromParent();