Lines Matching +full:lo +full:- +full:x2 +full:- +full:en

1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
49 #define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
59 "amdgpu-use-divergent-register-indexing",
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
183 computeRegisterProperties(Subtarget->getRegisterInfo());
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
350 // Most operations are naturally 32-bit vector operations. We only support
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
469 if (Subtarget->hasSMemRealTime() ||
470 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
474 if (Subtarget->has16BitInsts()) {
481 if (Subtarget->hasMadMacF32Insts())
484 if (!Subtarget->hasBFI())
488 if (!Subtarget->hasBCNT(32))
491 if (!Subtarget->hasBCNT(64))
494 if (Subtarget->hasFFBH())
497 if (Subtarget->hasFFBL())
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
508 if (Subtarget->hasBFE())
512 if (Subtarget->hasIntClamp())
515 if (Subtarget->hasAddNoCarry())
528 if (Subtarget->haveRoundOpsF64())
551 if (Subtarget->has16BitInsts()) {
582 // F16 - Constant Actions.
586 // F16 - Load/Store Actions.
592 // BF16 - Load/Store Actions.
598 // F16 - VOP1 Actions.
606 // F16 - VOP2 Actions.
613 // F16 - VOP3 Actions.
650 // XXX - Do these do anything? Vector constants turn into build_vector.
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
773 if (Subtarget->hasVOP3PInsts()) {
811 if (Subtarget->hasPackedFP32Ops()) {
822 if (Subtarget->has16BitInsts()) {
843 if (Subtarget->hasScalarSMulU64())
846 if (Subtarget->hasMad64_32())
849 if (Subtarget->hasPrefetch())
852 if (Subtarget->hasIEEEMinMax()) {
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
955 // FIXME: In other contexts we pretend this is a per-function property.
970 //===----------------------------------------------------------------------===//
972 //===----------------------------------------------------------------------===//
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1014 if (Subtarget->has16BitInsts()) {
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will still be
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1122 LLVMContext &Ctx = Ty->getContext();
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1146 /// Map address space 7 to MVT::v5i32 because that's its in-memory
1147 /// representation. This return value is vector-typed because there is no
1151 /// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1161 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1163 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1194 if (RsrcIntr->IsImage) {
1197 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1201 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1202 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1203 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1212 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1213 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1217 if (RsrcIntr->IsImage) {
1220 if (!BaseOpcode->Gather4) {
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1242 Type *DataTy = CI.getArgOperand(0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1262 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1266 // XXX - Should this be volatile without known ordering?
1268 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1275 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1303 if (!Vol->isZero())
1311 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1326 if (!Vol->isZero())
1396 Info.ptrVal = MFI->getGWSPSV(TM);
1411 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1424 Info.ptrVal = MFI->getGWSPSV(TM);
1445 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1446 unsigned DstAS = I.getType()->getPointerAddressSpace();
1460 switch (II->getIntrinsicID()) {
1482 Ptr = II->getArgOperand(0);
1485 Ptr = II->getArgOperand(1);
1490 AccessTy = II->getType();
1497 if (!Subtarget->hasFlatInstOffsets()) {
1509 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1514 if (Subtarget->hasFlatGlobalInsts())
1517 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1534 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1535 // additionally can do r + r + i with addr64. 32-bit has more addressing
1543 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1544 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1589 if (!Subtarget->hasScalarSubwordLoads()) {
1594 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1598 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1599 // SMRD instructions have an 8-bit, dword offset on SI.
1602 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1603 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1604 // in 8-bits, it can use a smaller encoding.
1607 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1608 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1611 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1612 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1617 // On GFX12, all offsets are signed 24-bit in bytes.
1625 // Scalar (non-buffer) loads can only use a negative offset if
1626 // soffset+offset is non-negative. Since the compiler can only prove that
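For reference, a rough standalone restatement of the per-generation scalar (SMRD/SMEM) immediate-offset ranges described in the comments above. The enum and helper name are illustrative stand-ins, not backend APIs; the real checks live in the isLegalAddressingMode path and AMDGPUBaseInfo.

// Sketch only: approximate legal scalar-load immediate offsets per generation.
#include "llvm/Support/MathExtras.h"
using namespace llvm;

enum class Gen { SI, CI, VI, GFX9, GFX12 }; // hypothetical stand-in

static bool isLegalSMRDImmOffsetSketch(Gen G, int64_t ByteOffset) {
  switch (G) {
  case Gen::SI:    return isUInt<8>(ByteOffset / 4);   // 8-bit offset, counted in dwords
  case Gen::CI:    return isUInt<32>(ByteOffset / 4);  // 32-bit literal, counted in dwords
  case Gen::VI:    return isUInt<20>(ByteOffset);      // SMEM format, 20-bit byte offset
  case Gen::GFX9:  return isInt<21>(ByteOffset);       // signed 21-bit bytes (negative
                                                       // offsets further restricted, see above)
  case Gen::GFX12: return isInt<24>(ByteOffset);       // signed 24-bit byte offset
  }
  return false;
}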
1642 return Subtarget->enableFlatScratch()
1647 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1648 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1650 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1651 // an 8-bit dword offset but we don't know the alignment here.
1682 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1700 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1704 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1717 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1718 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1720 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1723 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1728 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1737 // operates with a speed comparable to N-bit wide load". With the full
1752 if (!Subtarget->hasDS96AndDS128())
1755 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1775 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1778 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1783 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1807 // Note that we have a single-dword or sub-dword here, so if underaligned
1813 Subtarget->hasUnalignedDSAccessEnabled();
1822 Subtarget->enableFlatScratch() ||
1823 Subtarget->hasUnalignedScratchAccess();
1830 !Subtarget->hasUnalignedScratchAccess()) {
1839 // than multiple smaller memory ops -- even when misaligned
1845 Subtarget->hasUnalignedBufferAccessEnabled();
1852 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1853 // byte-address are ignored, thus forcing Dword alignment.
1873 // use. Make sure we switch these to 64-bit accesses.
1888 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1898 // Flat -> private/local is a simple truncate.
1899 // Flat -> global is a no-op.
1911 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1938 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1943 // These operations are done with 32-bit instructions anyway.
1977 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1986 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2024 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2026 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2052 int64_t OffsetDiff = Offset - AlignDownOffset;
2151 if (Subtarget->hasArchitectedSGPRs() &&
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2205 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() ||
2214 Arg->VT.getScalarSizeInBits() == 16) &&
2224 Skipped.set(Arg->getOrigArgIndex());
2229 Info->markPSInputAllocated(PSInputNum);
2230 if (Arg->Used)
2231 Info->markPSInputEnabled(PSInputNum);
2253 unsigned Mask = (Subtarget->hasPackedTID() &&
2260 if (Subtarget->hasPackedTID()) {
2274 if (Subtarget->hasPackedTID()) {
2318 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2421 // flat_scratch_init is not applicable for non-kernel functions.
2484 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2500 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2517 // Don't preload non-original args or parts not in the current preload
2537 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2541 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2556 if (PreloadRegs->size() > 1)
2580 // Allocate special input registers that are initialized per-wave.
2586 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2587 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2588 // Note: user SGPRs are handled by the front-end for graphics shaders
2657 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2671 // Record that we know we have non-spill stack objects so we don't need to
2712 // whereas non-entry functions get this "for free". This means there is no
2746 if (ST.getFrameLowering()->hasFP(MF)) {
2752 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2753 return !Info->isEntryFunction();
2763 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2765 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2769 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2770 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2771 MachineBasicBlock::iterator MBBI = Entry->begin();
2781 Register NewVR = MRI->createVirtualRegister(RC);
2783 Entry->addLiveIn(*I);
2784 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2787 // Insert the copy-back instructions right before the terminator.
2789 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2790 TII->get(TargetOpcode::COPY), *I)
2799 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2806 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2808 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2809 DAG.getContext()->diagnose(NoGraphicsHSA);
2824 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2826 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2827 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2828 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2830 if (!Subtarget->enableFlatScratch())
2834 !Subtarget->hasArchitectedSGPRs())
2835 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2836 !Info->hasWorkGroupIDZ());
2847 // based on run-time states. Since we can't know what the final PSInputEna
2852 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2853 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2855 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2856 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2859 Info->markPSInputAllocated(0);
2860 Info->markPSInputEnabled(0);
2862 if (Subtarget->isAmdPalOS()) {
2864 // based on run-time states; the register values being generated here are
2871 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2874 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
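A standalone restatement of the PSInputAddr rule spelled out above, using the bit positions from the comment (PERSP_* in 0xF, LINEAR_* in 0x70, POS_W_FLOAT at input 11). The helper is hypothetical; it mirrors the check made on getPSInputAddr()/isPSInputAllocated(11).

// Sketch: the PS input configuration needs fixing up when no PERSP_*/LINEAR_*
// input is enabled, or when POS_W_FLOAT (input 11) is used without any PERSP_*.
static bool psInputNeedsFixupSketch(unsigned PSInputAddr, bool PosWAllocated) {
  bool AnyPerspOrLinear = (PSInputAddr & 0x7F) != 0; // PERSP_* (0xF) or LINEAR_* (0x70)
  bool AnyPersp = (PSInputAddr & 0xF) != 0;          // PERSP_* only
  return !AnyPerspOrLinear || (PosWAllocated && !AnyPersp);
}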
2877 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2888 if (IsKernel && Subtarget->hasKernargPreload())
2897 if (!Subtarget->enableFlatScratch())
2898 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2913 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2950 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2954 int64_t OffsetDiff = Offset - AlignDownOffset;
2961 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2990 TRI->getRegSizeInBits(*RC)));
3009 // If the argument was preloaded to multiple consecutive 32-bit
3013 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3034 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3035 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3036 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3037 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3039 // less than 16-bits. On CI and newer they could potentially be
3078 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3083 // If this is an 8 or 16-bit value, it is really passed promoted
3119 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3120 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3124 Info->setBytesInStackArgArea(StackArgSize);
3149 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3174 Info->setIfReturnsVoid(Outs.empty());
3175 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3177 // CCValAssign - represents the assignment of the return value to a location.
3181 // CCState - Info about the registers and stack slots.
3226 if (!Info->isEntryFunction()) {
3227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3229 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3326 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3331 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3336 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3346 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3347 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3348 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3349 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3350 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3351 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3353 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3364 if (CLI.CB->hasFnAttr(Attr.second))
3368 CalleeArgInfo->getPreloadedValue(InputID);
3380 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3403 if (OutgoingArg->isRegister()) {
3404 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3405 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3423 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3426 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3429 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3443 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3444 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3445 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3448 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3450 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3457 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3458 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3466 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3467 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3493 if (OutgoingArg->isRegister()) {
3495 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3497 CCInfo.AllocateReg(OutgoingArg->getRegister());
3536 if (Callee->isDivergent())
3542 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3543 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3577 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3578 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3595 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3603 if (!CI->isTailCall())
3606 const Function *ParentFn = CI->getParent()->getParent();
3607 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3628 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3635 if (RequestedExec.Ty->isIntegerTy(64)) {
3682 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3727 // arguments to begin at SP+0. Completely unused for non-tail calls.
3737 if (!Subtarget->enableFlatScratch()) {
3742 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3805 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3823 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3828 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3853 // Build a sequence of copy-to-reg nodes chained together with token chain
3863 // We don't usually want to end the call-sequence here because we would tidy
3864 // the frame up *after* the call; however, in the ABI-changing tail-call case
3878 const GlobalValue *GV = GSD->getGlobal();
3901 // Add a register mask operand representing the call-preserved registers.
3902 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3903 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3971 Register SPReg = Info->getStackPtrOffsetReg();
3980 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3981 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3983 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3988 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3990 Align StackAlign = TFL->getStackAlign();
3994 DAG.getConstant(-(uint64_t)Alignment->value()
3995 << Subtarget->getWavefrontSizeLog2(),
4007 // We only handle constant sizes here to allow non-entry block, static sized
4026 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4047 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4061 // [1:0] Single-precision round mode.
4062 // [3:2] Double/Half-precision round mode.
4064 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4067 // Toward-0: hardware encoding 3, FLT_ROUNDS spec value 0
4070 // -Inf:     hardware encoding 2, FLT_ROUNDS spec value 3
4073 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4085 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4095 // There's a gap between the 4-bit encoded table and the actual enum values, so offset
4114 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4118 static_cast<uint32_t>(ConstMode->getZExtValue()),
4124 // the range 0-3, we can use a simplified mapping to hardware values.
4127 // The supported standard values are 0-3. The extended values start at 8. We
4131 // Truncate to the low 32-bits.
4145 // table_index = umin(value, value - 4)
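A minimal standalone illustration of the two tricks described above: the 64-bit constant holding sixteen 4-bit table entries, and the umin(value, value - 4) folding of the FLT_ROUNDS inputs. The constants and helper names are illustrative, not the values the backend actually emits.

// Sketch: select one 4-bit entry from a packed 64-bit table by shifting by 4*index.
#include <cstdint>

static uint32_t lookup4BitEntry(uint64_t PackedTable, uint32_t Index) {
  return static_cast<uint32_t>(PackedTable >> (Index * 4)) & 0xF;
}

// Sketch of the index folding for SET_ROUNDING: the supported standard values 0-3
// stay in place, while the extended values starting at 8 collapse onto 4-7.
static uint32_t roundingTableIndex(uint32_t Value) {
  uint32_t Wrapped = Value - 4u;             // wraps for Value < 4 (unsigned)
  return Value < Wrapped ? Value : Wrapped;  // umin(value, value - 4)
}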
4186 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4193 if (Op->isDivergent())
4196 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4313 if (!Subtarget->hasFlatScrRegister() &&
4314 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4346 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4347 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4348 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4364 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4365 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4369 MF->insert(MBBI, LoopBB);
4370 MF->insert(MBBI, RemainderBB);
4372 LoopBB->addSuccessor(LoopBB);
4373 LoopBB->addSuccessor(RemainderBB);
4376 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4382 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4385 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4387 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4398 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4402 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4414 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4418 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4421 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4422 Src->setIsKill(false);
4426 MachineBasicBlock::iterator I = LoopBB->end();
4432 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4441 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4445 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4448 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4472 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4478 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4484 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4490 // Read the next variant <- also loop target.
4491 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4495 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4500 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4512 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4519 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4522 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4531 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4536 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4540 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4543 return InsertPt->getIterator();
4546 // This has slightly sub-optimal regalloc when the source vector is killed by
4548 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4556 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4558 MachineRegisterInfo &MRI = MF->getRegInfo();
4562 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4569 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4572 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4579 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4585 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4588 MF->insert(MBBI, LandingPad);
4589 LoopBB->removeSuccessor(RemainderBB);
4590 LandingPad->addSuccessor(RemainderBB);
4591 LoopBB->addSuccessor(LandingPad);
4592 MachineBasicBlock::iterator First = LandingPad->begin();
4593 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4622 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4624 assert(Idx->getReg() != AMDGPU::NoRegister);
4627 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4629 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4642 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4645 return Idx->getReg();
4648 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4658 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4660 MachineRegisterInfo &MRI = MF->getRegInfo();
4663 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4664 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4665 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4668 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4677 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4688 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4696 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4713 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4719 MachineBasicBlock *LoopBB = InsPt->getParent();
4723 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4730 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4744 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4746 MachineRegisterInfo &MRI = MF->getRegInfo();
4749 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4750 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4751 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4752 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4753 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4754 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4757 assert(Val->getReg());
4761 SrcVec->getReg(),
4765 if (Idx->getReg() == AMDGPU::NoRegister) {
4771 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4781 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4789 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4791 .addReg(SrcVec->getReg())
4798 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4801 .addReg(SrcVec->getReg())
4810 if (Val->isReg())
4811 MRI.clearKillFlags(Val->getReg());
4818 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4820 MachineBasicBlock *LoopBB = InsPt->getParent();
4824 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4832 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4848 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4855 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4861 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4881 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4902 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4903 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4905 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4908 I = ComputeLoop->end();
4910 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4914 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4915 .addReg(TmpSReg->getOperand(0).getReg())
4920 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4921 .addReg(ActiveBits->getOperand(0).getReg());
4923 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4925 .addReg(FF1->getOperand(0).getReg());
4926 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4927 .addReg(Accumulator->getOperand(0).getReg())
4928 .addReg(LaneValue->getOperand(0).getReg());
4934 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4935 .addReg(FF1->getOperand(0).getReg())
4936 .addReg(ActiveBits->getOperand(0).getReg());
4939 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4941 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4946 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4947 .addReg(NewActiveBits->getOperand(0).getReg())
4949 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4961 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4962 MachineFunction *MF = BB->getParent();
4963 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4981 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4983 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4992 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4994 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4995 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5001 if (Subtarget->hasScalarAddSub64()) {
5003 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5008 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5015 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5018 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5020 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5025 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5028 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5031 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
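The plain arithmetic behind that 32-bit split, as a standalone sketch rather than the MIR expansion itself: the low halves are added first and produce a carry (SCC in the scalar sequence), which the add of the high halves then consumes before the two halves are reassembled.

// Sketch: 64-bit add via two 32-bit adds with an explicit carry.
#include <cstdint>

static uint64_t add64Via32(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
  uint32_t Carry = Lo < static_cast<uint32_t>(A) ? 1u : 0u; // models the SCC carry-out
  uint32_t Hi = static_cast<uint32_t>(A >> 32) + static_cast<uint32_t>(B >> 32) + Carry;
  return (static_cast<uint64_t>(Hi) << 32) | Lo;            // reassemble, like REG_SEQUENCE
}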
5042 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5054 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5059 TII->legalizeOperands(*Add);
5064 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5080 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5082 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5084 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5086 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5089 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5091 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5095 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5103 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5110 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5115 TII->legalizeOperands(*LoHalf);
5116 TII->legalizeOperands(*HiHalf);
5125 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5138 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5140 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5144 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5146 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5151 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5152 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5158 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5163 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5168 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5169 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5171 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5184 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5189 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5194 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5195 .addImm(-1)
5203 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5212 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5214 .addImm(MFI->getLDSSize());
5219 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5220 MachineRegisterInfo &MRI = MF->getRegInfo();
5234 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5237 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5246 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5284 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5285 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5296 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5307 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5309 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5311 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5313 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5316 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5318 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5321 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5336 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5345 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5347 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5349 Br->getOperand(1).setIsUndef(); // read undef SCC
5355 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5357 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5358 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5362 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5365 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5368 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5385 if (TII->pseudoToMCOpcode(Opc) == -1) {
5390 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5391 if (TII->isVOP3(*I)) {
5392 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5394 I.addReg(TRI->getVCC(), RegState::Define);
5401 TII->legalizeOperands(*I);
5411 TII->legalizeOperands(MI);
5416 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5422 if (getSubtarget()->hasGWSAutoReplay()) {
5446 if (getSubtarget()->hasDenormModeInst()) {
5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5467 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5468 unsigned ImmVal = Def->getOperand(1).getImm();
5470 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5478 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5491 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5499 MI.setDesc(TII->get(AMDGPU::COPY));
5503 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5504 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5513 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5514 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5515 MF->push_back(TrapBB);
5516 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5518 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5521 BB->addSuccessor(TrapBB);
5526 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5527 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5529 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5598 if (!Subtarget->hasMadMacF32Insts())
5599 return Subtarget->hasFastFMAF32();
5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5644 return Subtarget->hasMadMacF32Insts() &&
5654 EVT VT = N->getValueType(0);
5656 return Subtarget->hasMadMacF32Insts() &&
5659 return Subtarget->hasMadF16() &&
5666 //===----------------------------------------------------------------------===//
5668 //===----------------------------------------------------------------------===//
5681 SDValue Lo, Hi;
5682 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5685 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5686 Op->getFlags());
5688 Op->getFlags());
5712 Op->getFlags());
5714 Op->getFlags());
5744 Op->getFlags());
5746 Op->getFlags());
5760 Result.getNode()->getNumValues() == 2) &&
5808 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5820 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5942 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5943 EVT LoadVT = M->getValueType(0);
5964 VTList, Ops, M->getMemoryVT(),
5965 M->getMemOperand());
5976 EVT LoadVT = M->getValueType(0);
5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5983 bool IsTFE = M->getNumValues() == 3;
5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6000 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6001 M->getMemOperand(), DAG);
6007 M->getMemOperand(), DAG);
6015 EVT VT = N->getValueType(0);
6016 unsigned CondCode = N->getConstantOperandVal(3);
6022 SDValue LHS = N->getOperand(1);
6023 SDValue RHS = N->getOperand(2);
6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6049 EVT VT = N->getValueType(0);
6051 unsigned CondCode = N->getConstantOperandVal(3);
6055 SDValue Src0 = N->getOperand(1);
6056 SDValue Src1 = N->getOperand(2);
6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6078 EVT VT = N->getValueType(0);
6079 SDValue Src = N->getOperand(1);
6083 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6088 // (ballot 0) -> 0
6089 if (Arg->isZero())
6092 // (ballot 1) -> EXEC/EXEC_LO
6093 if (Arg->isOne()) {
6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6115 EVT VT = N->getValueType(0);
6117 unsigned IID = N->getConstantOperandVal(0);
6124 SDValue Src2, MVT ValT) -> SDValue {
6129 Operands.push_back(N->getOperand(6));
6130 Operands.push_back(N->getOperand(5));
6131 Operands.push_back(N->getOperand(4));
6150 if (SDNode *GL = N->getGluedNode()) {
6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6152 GL = GL->getOperand(0).getNode();
6160 SDValue Src0 = N->getOperand(1);
6164 Src1 = N->getOperand(2);
6166 Src2 = N->getOperand(3);
6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6198 EVT VT = N->getValueType(0);
6202 unsigned NumOperands = N->getNumOperands();
6204 SDNode *GL = N->getGluedNode();
6207 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6212 SDValue Operand = N->getOperand(j);
6226 Operands[NumOperands - 1] =
6228 SDValue(GL->getOperand(0).getNode(), 0));
6230 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6294 switch (N->getOpcode()) {
6306 unsigned IID = N->getConstantOperandVal(0);
6312 SDValue Src0 = N->getOperand(1);
6313 SDValue Src1 = N->getOperand(2);
6324 SDValue Src0 = N->getOperand(1);
6325 SDValue Src1 = N->getOperand(2);
6338 EVT VT = N->getValueType(0);
6353 if (!Subtarget->hasScalarSubwordLoads())
6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6372 if (!Offset->isDivergent()) {
6417 EVT VT = N->getValueType(0);
6419 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6420 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6430 N->getOperand(0), LHS, RHS);
6438 if (N->getValueType(0) != MVT::v2f16)
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6451 if (N->getValueType(0) != MVT::v2f16)
6455 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6464 if (N->getValueType(0) != MVT::f16)
6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6485 if (I->getOpcode() == Opcode)
6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6493 switch (Intr->getConstantOperandVal(1)) {
6514 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6515 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6525 return (GV->getValueType()->isFunctionTy() ||
6526 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6535 if (!GV->hasExternalLinkage())
6553 if (Intr->getOpcode() == ISD::SETCC) {
6556 Intr = SetCC->getOperand(0).getNode();
6562 Target = BR->getOperand(1);
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6575 (SetCC->getConstantOperandVal(1) == 1 &&
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6604 BR->getOperand(0),
6607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6621 CopyToReg->getOperand(1),
6622 SDValue(Result, i - 1),
6625 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6630 SDValue(Intr, Intr->getNumValues() - 1),
6631 Intr->getOperand(0));
6647 if (Info->isEntryFunction())
6654 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6655 // Get the return address reg and mark it as an implicit live-in
6656 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6673 "Do not know how to custom lower FP_ROUND for non-f16 type");
6696 bool IsIEEEMode = Info->getMode().IEEE;
6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6759 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6763 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6767 // operands are zero-extended/sign-extended from 32-bits, then we split the
6768 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6769 // possible to check if the operands are zero-extended or sign-extended in
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6780 if (Op->isDivergent())
6785 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6786 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6787 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
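The decomposition behind that splitting, shown as a standalone sketch rather than the DAG/MIR lowering: modulo 2^64, only the low product and the low 32 bits of the cross terms matter.

// Sketch: 64-bit multiply assembled from 32-bit pieces. When both operands are
// known zero-extended from 32 bits the cross terms vanish, and the sign-extended
// case likewise reduces to a single signed 32x32->64 multiply, which is what the
// s_mul_u64_u32 / s_mul_i64_i32 pseudos mentioned above rely on.
#include <cstdint>

static uint64_t mul64Via32(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);
  uint64_t LoProduct = static_cast<uint64_t>(ALo) * BLo;  // full 32x32->64 product
  uint32_t Cross = ALo * BHi + AHi * BLo;                 // only low 32 bits contribute
  return LoProduct + (static_cast<uint64_t>(Cross) << 32);
}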
6813 const APInt &C = RHSC->getAPIntValue();
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6834 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
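A standalone restatement of the mulo rule quoted above for the unsigned case (the signed form uses an arithmetic shift right instead); the helper is illustrative only.

// Sketch of mulo(X, 1 << S): the product is X << S, and overflow occurred exactly
// when shifting back fails to recover X. Assumes S is less than the bit width.
#include <cstdint>

static bool umulPow2Overflow(uint64_t X, unsigned S, uint64_t &Product) {
  Product = X << S;            // X * (1 << S), modulo 2^64
  return (Product >> S) != X;  // bits were lost => overflow
}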
6842 if (Op->isDivergent()) {
6846 if (Subtarget->hasSMulHi()) {
6857 if (!Subtarget->isTrapHandlerEnabled() ||
6858 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6897 Register UserSGPR = Info->getQueuePtrUserSGPR();
6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6931 if (Subtarget->hasPrivEnabledTrap2NopBug())
6947 if (!Subtarget->isTrapHandlerEnabled() ||
6948 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6968 if (Subtarget->hasApertureRegs()) {
6972 // Note: this feature (register) is broken. When used as a 32-bit operand,
7008 Register UserSGPR = Info->getQueuePtrUserSGPR();
7011 // amdgpu-no-queue-ptr. This is undefined.
7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7062 SrcAS = ASC->getSrcAddressSpace();
7063 Src = ASC->getOperand(0);
7064 DestAS = ASC->getDestAddressSpace();
7069 Src = Op->getOperand(1);
7070 SrcAS = Op->getConstantOperandVal(2);
7071 DestAS = Op->getConstantOperandVal(3);
7077 // flat -> local/private
7095 // local/private -> flat
7123 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7132 // global <-> flat are no-ops and never emitted.
7137 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7139 return DAG.getUNDEF(Op->getValueType(0));
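A rough model of the address-space casts described above, ignoring the null-pointer checks and the invalid-cast diagnostics the real lowering adds. It assumes the aperture value is the 32-bit high half of the segment's flat base, as supplied via the aperture registers or the queue pointer.

// Sketch: flat -> local/private truncates to 32 bits; local/private -> flat pairs
// the 32-bit pointer with the aperture as the high half (a build_pair in the DAG).
#include <cstdint>

static uint32_t flatToSegmentSketch(uint64_t FlatPtr) {
  return static_cast<uint32_t>(FlatPtr);
}

static uint64_t segmentToFlatSketch(uint32_t SegPtr, uint32_t ApertureHi) {
  return (static_cast<uint64_t>(ApertureHi) << 32) | SegPtr;
}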
7156 unsigned IdxVal = Idx->getAsZExtVal();
7160 // Insert 32-bit registers one at a time.
7222 unsigned Idx = KIdx->getZExtValue();
7227 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7250 // Convert vector index to bit-index and get the required bit mask.
7292 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7297 SDValue Lo, Hi;
7303 Lo = DAG.getBitcast(LoVT,
7317 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7331 Lo = DAG.getBitcast(LoVT,
7342 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7344 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7365 // Convert vector index to bit-index (* EltSize)
7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7406 if (elementPairIsContiguous(SVN->getMask(), I)) {
7407 const int Idx = SVN->getMaskElt(I);
7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7411 PackVT, SVN->getOperand(VecIdx),
7415 const int Idx0 = SVN->getMaskElt(I);
7416 const int Idx1 = SVN->getMaskElt(I + 1);
7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7420 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7422 SDValue Vec0 = SVN->getOperand(VecIdx0);
7426 SDValue Vec1 = SVN->getOperand(VecIdx1);
7470 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7473 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7526 SDValue Lo = Op.getOperand(0);
7531 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7532 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7541 if (Lo.isUndef())
7544 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7545 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7547 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7555 // which can create arbitrary 64-bit addends. (This is only a problem for
7561 if (!Subtarget->isAmdHsaOS())
7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7568 !shouldEmitGOTReloc(GA->getGlobal());
7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7586 // constant, which is a pc-relative offset from the encoding of the $symbol
7591 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7595 // fixups or relocations are emitted to replace $symbol@*@lo and
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7615 const GlobalValue *GV = GSD->getGlobal();
7616 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7620 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7621 GV->hasExternalLinkage()) {
7622 Type *Ty = GV->getValueType();
7624 // zero-sized type in other languages to declare the dynamic shared
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7632 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7633 MFI->setUsesDynamicLDS(true);
7641 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7649 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7653 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7660 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7663 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7704 // The local size values will have the hi 16-bits as zero.
7712 "non-hsa intrinsic with hsa target",
7714 DAG.getContext()->diagnose(BadIntrin);
7723 DAG.getContext()->diagnose(BadIntrin);
7768 while (ExtraElts--)
7774 // Re-construct the required return value for an image load intrinsic.
7813 NumDataDwords - MaskPopDwords);
7843 if (Result->getNumValues() == 1)
7853 uint64_t Value = TexFailCtrlConst->getZExtValue();
7861 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7862 Value &= ~(uint64_t)0x2;
7877 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7883 I == DimIdx + NumGradients - 1))) {
7903 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7905 unsigned IntrOpcode = Intr->BaseOpcode;
7910 SmallVector<EVT, 3> ResultTypes(Op->values());
7911 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7926 if (BaseOpcode->Atomic) {
7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7934 if (BaseOpcode->AtomicX2) {
7949 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7952 if (BaseOpcode->Store) {
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7965 } else if (!BaseOpcode->NoReturn) {
7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7999 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8004 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8014 // occupies a full 32 bits.
8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8036 if (!ST->hasA16()) {
8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8051 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8060 ArgOffset + Intr->GradientStart,
8061 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8063 for (unsigned I = ArgOffset + Intr->GradientStart;
8064 I < ArgOffset + Intr->CoordStart; I++)
8071 ArgOffset + Intr->CoordStart, VAddrEnd,
8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8080 // without introducing moves, then using the non-sequential address encoding
8086 // so force non-NSA for the common 2-address case as a heuristic.
8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8095 const bool UseNSA = ST->hasNSAEncoding() &&
8096 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8104 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8113 if (!BaseOpcode->Sampler) {
8117 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8124 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8131 // Expecting to get an error flag since TFC is on - and dmask is 0
8145 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8146 // This is a no-op load. This can be eliminated
8166 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8167 if (BaseOpcode->Atomic)
8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8174 if (BaseOpcode->Store || BaseOpcode->Atomic)
8177 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8184 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8185 if (BaseOpcode->Sampler)
8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8189 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8194 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8197 if (!Subtarget->hasGFX90AInsts()) {
8199 } else if (TFE->getAsZExtVal()) {
8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8205 Ops.push_back(DimInfo->DA ? True : False);
8206 if (BaseOpcode->HasD16)
8213 int Opcode = -1;
8229 if (Subtarget->hasGFX90AInsts()) {
8232 if (Opcode == -1)
8236 if (Opcode == -1 &&
8237 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8240 if (Opcode == -1)
8244 if (Opcode == -1)
8249 MachineMemOperand *MemRef = MemOp->getMemOperand();
8253 if (BaseOpcode->AtomicX2) {
8258 if (BaseOpcode->NoReturn)
8261 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8280 if (!Offset->isDivergent()) {
8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8296 !Subtarget->hasScalarDwordx3Loads()) {
8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8347 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8362 if (!Subtarget->hasArchitectedSGPRs())
8376 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8405 // TODO: Should this propagate fast-math-flags?
8409 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8416 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8420 DAG.getContext()->diagnose(BadIntrin);
8429 if (MFI->isEntryFunction())
8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8463 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8464 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8473 if (Subtarget->isAmdHsaOS())
8480 if (Subtarget->isAmdHsaOS())
8487 if (Subtarget->isAmdHsaOS())
8494 if (Subtarget->isAmdHsaOS())
8501 if (Subtarget->isAmdHsaOS())
8508 if (Subtarget->isAmdHsaOS())
8515 if (Subtarget->isAmdHsaOS())
8521 if (Subtarget->isAmdHsaOS())
8527 if (Subtarget->isAmdHsaOS())
8544 if (MFI->isEntryFunction())
8550 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8552 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8554 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8618 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8622 // There is a Pat that handles this variant, so return it as-is.
8685 Op->getOperand(1), Op->getOperand(2)), 0);
8694 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8717 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8718 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8720 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8806 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8807 M->getMemOperand());
8834 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8835 M->getMemOperand());
8847 SDValue Chain = M->getOperand(0);
8848 SDValue M0 = M->getOperand(2);
8849 SDValue Value = M->getOperand(3);
8850 unsigned IndexOperand = M->getConstantOperandVal(7);
8851 unsigned WaveRelease = M->getConstantOperandVal(8);
8852 unsigned WaveDone = M->getConstantOperandVal(9);
8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8881 Offset1 |= (CountDw - 1) << 6;
8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8895 M->getVTList(), Ops, M->getMemoryVT(),
8896 M->getMemOperand());
8973 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9000 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9126 Op->getVTList(), Ops, VT, M->getMemOperand());
9130 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9149 Op->getVTList(), Ops, VT, M->getMemOperand());
9153 SDValue NodePtr = M->getOperand(2);
9154 SDValue RayExtent = M->getOperand(3);
9155 SDValue RayOrigin = M->getOperand(4);
9156 SDValue RayDir = M->getOperand(5);
9157 SDValue RayInvDir = M->getOperand(6);
9158 SDValue TDescr = M->getOperand(7);
9165 if (!Subtarget->hasGFX10_AEncoding()) {
9178 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9179 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9199 assert(Opcode != -1);
9265 Ops.append(16 - Ops.size(), Undef);
9276 Ops.push_back(M->getChain());
9278 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9279 MachineMemOperand *MemRef = M->getMemOperand();
9293 M->getOperand(0), // Chain
9294 M->getOperand(2), // Ptr
9295 M->getOperand(3) // Value
9316 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9317 Ops, M->getMemOperand());
9320 SDValue Chain = Op->getOperand(0);
9326 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9327 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9341 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9389 if (!Subtarget->hasDwordx3LoadStores() &&
9416 if (Subtarget->hasUnpackedD16VMem()) {
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9491 if (!Subtarget->hasCompressedExport()) {
9495 DAG.getContext()->diagnose(BadIntrin);
9513 Op.getOperand(3), // en
9517 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9518 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9571 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9572 M->getMemoryVT(), M->getMemOperand());
9599 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9600 M->getMemoryVT(), M->getMemOperand());
9649 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9650 M->getMemoryVT(), M->getMemOperand());
9701 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9702 M->getMemoryVT(), M->getMemOperand());
9716 unsigned Size = Op->getConstantOperandVal(4);
9767 MachineMemOperand *LoadMMO = M->getMemOperand();
9770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9778 auto F = LoadMMO->getFlags() &
9782 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9786 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9788 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9795 unsigned Size = Op->getConstantOperandVal(4);
9819 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9823 if (LHS->isDivergent())
9826 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9835 if (!Addr->isDivergent()) {
9849 MachineMemOperand *LoadMMO = M->getMemOperand();
9850 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9851 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9857 auto F = LoadMMO->getFlags() &
9861 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9864 LoadMMO->getAAInfo());
9866 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9873 Op->getOperand(2), Chain), 0);
9877 SDValue Chain = Op->getOperand(0);
9879 SDValue BarOp = Op->getOperand(2);
9885 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9941 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9975 unsigned ImmOffset = C1->getZExtValue();
9985 ImmOffset -= Overflow;
10014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10017 uint32_t Imm = C->getZExtValue();
10019 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10030 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10032 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10040 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10066 SDValue Pointer = Op->getOperand(1);
10067 SDValue Stride = Op->getOperand(2);
10068 SDValue NumRecords = Op->getOperand(3);
10069 SDValue Flags = Op->getOperand(4);
10076 ConstStride = ConstNode->getZExtValue();
10148 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10149 M->getMemOperand());
10172 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10173 // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
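// Editor's note (annotation, not part of SIISelLowering.cpp): conceptually,
// the transform below turns a uniform, 4-byte-aligned sub-dword load such as
//   %v = load i16, ptr %p, align 4
// into a 32-bit load followed by the matching truncate / sign- or zero-extend,
//   %w = load i32, ptr %p, align 4 ; %v = trunc/[sz]ext %w
// so it can be selected as an SMEM dword load. The checks that follow reject
// the unaligned, divergent, and non-invariant-global cases where this widening
// would be unsafe or unprofitable.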
10176 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10180 unsigned AS = Ld->getAddressSpace();
10183 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10188 // pre-legalize.
10189 EVT MemVT = Ld->getMemoryVT();
10196 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10200 SDValue Ptr = Ld->getBasePtr();
10202 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10203 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10204 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10209 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10215 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10218 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10219 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10222 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10225 EVT VT = Ld->getValueType(0);
10230 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10231 // the appropriate extension from the 32-bit load.
10232 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10252 ISD::LoadExtType ExtType = Load->getExtensionType();
10253 EVT MemVT = Load->getMemoryVT();
10262 SDValue Chain = Load->getChain();
10263 SDValue BasePtr = Load->getBasePtr();
10264 MachineMemOperand *MMO = Load->getMemOperand();
10300 "Custom lowering for non-i32 vectors hasn't been implemented.");
10302 Align Alignment = Load->getAlign();
10303 unsigned AS = Load->getAddressSpace();
10304 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10314 !Subtarget->hasMultiDwordFlatScratchAddressing())
10315 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10322 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10324 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10328 // Non-uniform loads will be selected to MUBUF instructions, so they
10337 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10338 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10341 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10345 // Non-uniform loads will be selected to MUBUF instructions, so they
10357 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10367 switch (Subtarget->getMaxPrivateElementSize()) {
10382 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10391 auto Flags = Load->getMemOperand()->getFlags();
10393 Load->getAlign(), Flags, &Fast) &&
10402 MemVT, *Load->getMemOperand())) {
10431 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10438 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10450 const SDNodeFlags Flags = Op->getFlags();
10462 if (CLHS->isExactlyValue(1.0)) {
10470 // 1.0 / sqrt(x) -> rsq(x)
10472 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10474 // 1.0 / x -> rcp(x)
10479 if (CLHS->isExactlyValue(-1.0)) {
10480 // -1.0 / x -> rcp (fneg x)
10492 // x / y -> x * (1.0 / y)
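// Editor's note (annotation, not part of SIISelLowering.cpp): the reciprocal
// rewrites above are exact only in real arithmetic, which is why this path is
// gated on the fast-math flags checked in this function. Folding the negation
// into the operand, by contrast, is exact even in IEEE arithmetic:
static_assert(-1.0 / 8.0 == 1.0 / -8.0,
              "-1.0 / x == 1.0 / (-x), so rcp(fneg x) preserves the value "
              "that rcp approximates");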
10503 const SDNodeFlags Flags = Op->getFlags();
10527 if (GlueChain->getNumValues() <= 1) {
10531 assert(GlueChain->getNumValues() == 3);
10549 if (GlueChain->getNumValues() <= 1) {
10553 assert(GlueChain->getNumValues() == 3);
10590 SDNodeFlags Flags = Op->getFlags();
10600 const APFloat K1Val(0x1p-32f);
10627 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10628 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10641 SDNodeFlags Flags = Op->getFlags();
10669 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10679 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10697 if (Subtarget->hasDenormModeInst()) {
10740 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10805 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10857 EVT ResultExpVT = Op->getValueType(1);
10868 if (Subtarget->hasFractBug()) {
10886 EVT VT = Store->getMemoryVT();
10889 return DAG.getTruncStore(Store->getChain(), DL,
10890 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10891 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10895 Store->getValue().getValueType().getScalarType() == MVT::i32);
10897 unsigned AS = Store->getAddressSpace();
10898 if (Subtarget->hasLDSMisalignedBug() &&
10900 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10909 !Subtarget->hasMultiDwordFlatScratchAddressing())
10910 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10919 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10923 VT, *Store->getMemOperand()))
10929 switch (Subtarget->getMaxPrivateElementSize()) {
10938 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10946 auto Flags = Store->getMemOperand()->getFlags();
10948 Store->getAlign(), Flags, &Fast) &&
10965 assert(!Subtarget->has16BitInsts());
10966 SDNodeFlags Flags = Op->getFlags();
10980 SDNodeFlags Flags = Op->getFlags();
10991 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11009 DAG.getConstant(-1, DL, MVT::i32));
11054 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11075 // r0 = 0.5 - h0 * g0
11079 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11083 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11088 SDNodeFlags Flags = Op->getFlags();
11093 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11130 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11136 // with finite only or nsz because rsq(+/-0) = +/-inf
11143 // If x is +INF, +0, or -0, use its original value
11154 // Propagate fast-math flags so that the multiply we introduce can be folded
11156 auto Flags = Op->getFlags();
11160 if (Subtarget->hasTrigReducedRange()) {
11179 assert(AtomicNode->isCompareAndSwap());
11180 unsigned AS = AtomicNode->getAddressSpace();
11186 // Non-local address space requires custom lowering for atomic compare
11200 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11201 Ops, VT, AtomicNode->getMemOperand());
11204 //===----------------------------------------------------------------------===//
11206 //===----------------------------------------------------------------------===//
11210 EVT VT = N->getValueType(0);
11218 SDValue Src = N->getOperand(0);
11223 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11244 SDValue MagnitudeOp = N->getOperand(0);
11245 SDValue SignOp = N->getOperand(1);
11251 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11274 // fcopysign f64:x, f64:y ->
11282 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11286 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11287 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11291 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
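// Editor's note (annotation, not part of SIISelLowering.cpp): compile-time
// checks of the identities behind this combine, with arbitrary constants and
// with x and c1 sharing no set bits for the 'or' form:
static_assert(((0x1230u + 0x3u) << 4) == ((0x1230u << 4) + (0x3u << 4)),
              "shl distributes over add");
static_assert(((0x1230u | 0x3u) << 4) == ((0x1230u << 4) + (0x3u << 4)),
              "or behaves as add when the operands share no set bits");
static_assert((0x1230u + 0x3u) * 5u == (0x1230u * 5u) + (0x3u * 5u),
              "mul distributes over add");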
11306 SDValue N0 = N->getOperand(0);
11307 SDValue N1 = N->getOperand(1);
11312 N0->hasOneUse())
11325 if (N0->getOpcode() == ISD::OR &&
11331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11341 EVT VT = N->getValueType(0);
11347 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11349 N0->getFlags().hasNoUnsignedWrap()));
11358 switch (N->getOpcode()) {
11374 SDValue Ptr = N->getOperand(PtrIdx);
11378 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11379 N->getMemoryVT(), DCI);
11381 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11397 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11398 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11399 // integer combine opportunities since most 64-bit operations are decomposed
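// Editor's note (annotation, not part of SIISelLowering.cpp): the split relies
// on a bitwise op acting independently on each 32-bit half of an i64, e.g.:
static_assert((0x1122334455667788ull & 0x00FF00FF0F0F0F0Full) ==
                  (((unsigned long long)(0x11223344u & 0x00FF00FFu) << 32) |
                   (0x55667788u & 0x0F0F0F0Fu)),
              "a 64-bit AND is two independent 32-bit ANDs");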
11407 uint64_t Val = CRHS->getZExtValue();
11410 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11414 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11415 // If we need to materialize a 64-bit immediate, it will be split up later
11416 // anyway. Avoid creating the harder to understand 64-bit immediate
11450 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11458 // or -1 if it did not succeed.
11460 // value 0-3 selects corresponding source byte;
11473 uint32_t C = N1->getZExtValue();
11510 EVT VT = N->getValueType(0);
11511 SDValue LHS = N->getOperand(0);
11512 SDValue RHS = N->getOperand(1);
11527 uint64_t Mask = CRHS->getZExtValue();
11529 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11531 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11532 unsigned Shift = CShift->getZExtValue();
11533 unsigned NB = CRHS->getAPIntValue().countr_zero();
11535 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11538 LHS->getOperand(0),
11551 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11566 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11569 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11570 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11584 if (!C1 || !C1->isInfinity() || C1->isNegative())
11612 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11613 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11614 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11621 Mask->getZExtValue() & ~OrdMask :
11622 Mask->getZExtValue() & OrdMask;
11640 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11643 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11655 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11664 // Each byte in each mask is either selector mask 0-3, or has higher
11714 // trunc* 255 srl -256
11718 // *In this example, the truncs are from i32->i16
11720 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11721 // respectively. calculateSrcByte would find (given node) -> ultimate src &
11722 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11744 switch (Op->getOpcode()) {
11746 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11752 SDValue NarrowOp = Op->getOperand(0);
11754 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11755 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11756 NarrowVT = VTSign->getVT();
11764 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11769 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11773 uint64_t BitShift = ShiftOp->getZExtValue();
11780 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11807 if (Index > BitWidth / 8 - 1)
11826 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11828 if (!LHS || LHS->isConstantZero())
11830 if (!RHS || RHS->isConstantZero())
11839 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11843 uint32_t BitMask = BitMaskOp->getZExtValue();
11855 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11862 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
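// Editor's note (annotation, not part of SIISelLowering.cpp): a concrete
// instance of the fshr formula above for BW = 32 and Z = 8; the result is the
// low 32 bits of the 64-bit concatenation X:Y shifted right by 8.
static_assert(((0xAABBCCDDu << 24) | (0x11223344u >> 8)) == 0xDD112233u,
              "fshr(X, Y, 8) == (X << 24) | (Y >> 8) for 32-bit X, Y");
static_assert((unsigned)(0xAABBCCDD11223344ull >> 8) == 0xDD112233u,
              "which equals the low word of (X:Y) >> 8");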
11863 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11871 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11890 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11894 uint64_t BitShift = ShiftOp->getZExtValue();
11904 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11908 return BytesProvided - ByteShift > Index
11909 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11918 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11922 uint64_t BitShift = ShiftOp->getZExtValue();
11930 // of interest is Index - ByteShift of the src
11933 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11945 SDValue NarrowOp = Op->getOperand(0);
11947 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11948 Op->getOpcode() == ISD::AssertZext ||
11949 Op->getOpcode() == ISD::AssertSext) {
11950 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11951 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11989 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11998 return L->getExtensionType() == ISD::ZEXTLOAD
12015 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12020 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12023 auto VecIdx = IdxOp->getZExtValue();
12035 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12040 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12072 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12075 auto MemVT = L->getMemoryVT();
12078 return L->getMemoryVT().getSizeInBits() == 16;
12093 bool IsConsecutive = (Hi8 - Low8 == 1);
12164 : NumElements - NormalizedTrunc;
12185 [[maybe_unused]] EVT VT = N->getValueType(0);
12195 if (!P || P->isConstantZero())
12217 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12218 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12223 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12248 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12251 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12261 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12264 // ANY_EXTEND as the extended bits are don't-cares.
12277 SDValue LHS = N->getOperand(0);
12278 SDValue RHS = N->getOperand(1);
12280 EVT VT = N->getValueType(0);
12282 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12297 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12306 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12310 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12320 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12321 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12323 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12328 // If we have any non-vectorized use, then it is a candidate for v_perm
12329 if (OrUse->getOpcode() != ISD::BITCAST ||
12330 !OrUse->getValueType(0).isVector())
12333 // If any use of the bitcast is non-vectorized, it is still a candidate for v_perm
12334 for (auto VUse : OrUse->uses()) {
12335 if (!VUse->getValueType(0).isVector())
12340 // TODO -- whitelist more uses
12342 if (VUse->getOpcode() == VectorwiseOp)
12348 if (!any_of(N->uses(), usesCombinedOperand))
12363 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12398 // (or i64:x, (zero_extend i32:y)) ->
12422 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12426 N->getOperand(0), CRHS))
12438 SDValue LHS = N->getOperand(0);
12439 SDValue RHS = N->getOperand(1);
12444 EVT VT = N->getValueType(0);
12451 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12452 // fneg-like xors into 64-bit select.
12455 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12457 // xor (select c, a, b), 0x80000000 ->
12461 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12463 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12467 LHS->getOperand(0), FNegLHS, FNegRHS);
12477 if (!Subtarget->has16BitInsts() ||
12481 EVT VT = N->getValueType(0);
12485 SDValue Src = N->getOperand(0);
12495 SDValue Src = N->getOperand(0);
12496 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12501 VTSign->getVT() == MVT::i8) ||
12503 VTSign->getVT() == MVT::i16))) {
12504 assert(Subtarget->hasScalarSubwordLoads() &&
12520 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12525 VTSign->getVT() == MVT::i8) ||
12527 VTSign->getVT() == MVT::i16)) &&
12547 Ops, M->getMemoryVT(),
12548 M->getMemOperand());
12558 SDValue Mask = N->getOperand(1);
12560 // fp_class x, 0 -> false
12564 if (N->getOperand(0).isUndef())
12572 EVT VT = N->getValueType(0);
12573 SDValue N0 = N->getOperand(0);
12584 N->getFlags());
12589 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12591 N0.getOperand(0), N->getFlags());
12604 const auto &F = CFP->getValueAPF();
12667 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12676 if (RHS->getZExtValue() == 0xffff0000) {
12677 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12705 if (Subtarget->supportsMinMaxDenormModes() ||
12711 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12716 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12723 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12724 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12729 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12737 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12740 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12741 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12751 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12759 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12801 unsigned Opcode = MI->getOpcode();
12809 if (FCR->Value.isSignaling())
12811 if (!FCR->Value.isDenormal())
12814 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12852 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12859 if (Subtarget->supportsMinMaxDenormModes() ||
12867 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12868 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12873 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12939 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12957 SDValue N0 = N->getOperand(0);
12958 EVT VT = N->getValueType(0);
12960 // fcanonicalize undef -> qnan
12967 EVT VT = N->getValueType(0);
12968 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12971 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12974 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12982 SDValue Lo = N0.getOperand(0);
12984 EVT EltVT = Lo.getValueType();
12986 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
12991 CFP->getValueAPF());
13065 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13068 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13072 EVT VT = MinK->getValueType(0);
13074 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13080 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13089 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13109 if (K0->getValueAPF() > K1->getValueAPF())
13117 if (Info->getMode().DX10Clamp) {
13120 // FIXME: Should this be allowing -0.0?
13121 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13135 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13137 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13138 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13139 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13178 EVT VT = N->getValueType(0);
13179 unsigned Opc = N->getOpcode();
13180 SDValue Op0 = N->getOperand(0);
13181 SDValue Op1 = N->getOperand(1);
13187 // max(max(a, b), c) -> max3(a, b, c)
13188 // min(min(a, b), c) -> min3(a, b, c)
13193 N->getValueType(0),
13200 // max(a, max(b, c)) -> max3(a, b, c)
13201 // min(a, min(b, c)) -> min3(a, b, c)
13206 N->getValueType(0),
13213 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13214 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
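// Editor's note (annotation, not part of SIISelLowering.cpp): med3 returns the
// median of its three operands, so the usual clamp idiom collapses into it.
// Minimal check with illustrative helpers and K0 = 0, K1 = 15:
constexpr int clampRef(int X) { return X < 0 ? 0 : (X > 15 ? 15 : X); }
constexpr int med3Ref(int A, int B, int C) {
  return A > B ? (B > C ? B : (A > C ? C : A))
               : (A > C ? A : (B > C ? C : B));
}
static_assert(med3Ref(-7, 0, 15) == clampRef(-7) &&
                  med3Ref(4, 0, 15) == clampRef(4) &&
                  med3Ref(99, 0, 15) == clampRef(99),
              "min(max(x, K0), K1) == med3(x, K0, K1) when K0 < K1");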
13217 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13222 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13228 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13233 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13237 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13243 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13244 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13256 // FIXME: Should this be allowing -0.0?
13257 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13258 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13268 EVT VT = N->getValueType(0);
13275 SDValue Src0 = N->getOperand(0);
13276 SDValue Src1 = N->getOperand(1);
13277 SDValue Src2 = N->getOperand(2);
13280 // const_a, const_b, x -> clamp is safe in all cases including signaling
13282 // FIXME: Should this be allowing -0.0?
13290 // handling no dx10-clamp?
13291 if (Info->getMode().DX10Clamp) {
13312 SDValue Src0 = N->getOperand(0);
13313 SDValue Src1 = N->getOperand(1);
13315 return DCI.DAG.getUNDEF(N->getValueType(0));
13319 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13330 // Sub-dword vectors of size 2 dword or less have better implementation.
13334 // Always expand the rest of sub-dword instructions, otherwise it will be
13339 // Always do this if var-idx is divergent, otherwise it will become a loop.
13349 if (!Subtarget->hasMovrel())
13358 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13362 SDValue Vec = N->getOperand(0);
13369 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13374 SDValue Vec = N->getOperand(0);
13379 EVT ResVT = N->getValueType(0);
13387 SDValue Idx = N->getOperand(1);
13393 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13397 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13400 SDValue Idx = N->getOperand(1);
13428 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13433 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
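// Editor's note (annotation, not part of SIISelLowering.cpp): for a 4-element
// vector the expansion is a chain of selects against each constant index; an
// illustrative scalar equivalent:
constexpr int extractViaSelects(int E0, int E1, int E2, int E3, int Idx) {
  return Idx == 3 ? E3 : (Idx == 2 ? E2 : (Idx == 1 ? E1 : E0));
}
static_assert(extractViaSelects(10, 20, 30, 40, 2) == 30,
              "a variable-index extract becomes per-element selects");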
13436 SDValue Idx = N->getOperand(1);
13452 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13454 // multiple small extract_vector_elements with a single 32-bit extract.
13455 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13460 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13493 SDValue Vec = N->getOperand(0);
13494 SDValue Idx = N->getOperand(2);
13498 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13499 // => BUILD_VECTOR n x select (e, const-idx)
13505 SDValue Ins = N->getOperand(1);
13528 APFloat Val = CFP->getValueAPF();
13540 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13543 SDValue TruncSrc = N->getOperand(0);
13544 EVT VT = N->getValueType(0);
13586 EVT VT = N0->getValueType(0);
13592 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13599 (N0->getFlags().hasAllowContract() &&
13600 N1->getFlags().hasAllowContract())) &&
13609 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13612 EVT VT = N->getValueType(0);
13619 unsigned Opc = N->getOpcode();
13620 SDValue Op0 = N->getOperand(0);
13621 SDValue Op1 = N->getOperand(1);
13623 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13626 if (Op0->isDivergent())
13634 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13637 if (Op1->isDivergent())
13655 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13658 // Full 64-bit multiplies that feed into an addition are lowered here instead
13665 assert(N->getOpcode() == ISD::ADD);
13668 EVT VT = N->getValueType(0);
13670 SDValue LHS = N->getOperand(0);
13671 SDValue RHS = N->getOperand(1);
13678 if (!N->isDivergent() && Subtarget->hasSMulHi())
13691 // multiple uses, except on hardware with full-rate multiply-add (which is
13692 // part of full-rate 64-bit ops).
13693 if (!Subtarget->hasFullRate64Ops()) {
13695 for (SDNode *Use : LHS->uses()) {
13698 if (Use->getOpcode() != ISD::ADD)
13737 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13738 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13739 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13742 // are {sign,zero}-extended or not.
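// Editor's note (annotation, not part of SIISelLowering.cpp): the recipe above
// is a 64 x 64 -> 64 multiply-add done in 32-bit pieces; the lhs.hi * rhs.hi
// term and all carries out of bit 63 vanish modulo 2^64. An illustrative check:
constexpr unsigned long long mad64Via32(unsigned long long Lhs,
                                        unsigned long long Rhs,
                                        unsigned long long Accum) {
  // accum    = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi += mul32 lhs.hi, rhs.lo
  // accum.hi += mul32 lhs.lo, rhs.hi
  return (unsigned long long)(unsigned)Lhs * (unsigned)Rhs + Accum +
         ((unsigned long long)((unsigned)(Lhs >> 32) * (unsigned)Rhs +
                               (unsigned)Lhs * (unsigned)(Rhs >> 32))
          << 32);
}
static_assert(mad64Via32(0x123456789ABCDEF0ull, 0x0FEDCBA987654321ull, 42u) ==
                  0x123456789ABCDEF0ull * 0x0FEDCBA987654321ull + 42u,
              "64-bit mul+add reconstructed from 32-bit pieces");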
13786 if (!Byte0 || Byte0->isConstantZero()) {
13790 if (Byte1 && !Byte1->isConstantZero()) {
13837 unsigned FMask = 0xFF << (8 * (3 - Step));
13840 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13842 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13846 int FirstGroup = -1;
13856 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13861 if (FirstGroup != -1) {
13869 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13880 unsigned FMask = 0xFF << (8 * (3 - Step));
13884 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13888 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13901 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13904 if (Elt->PermMask == 0x3020100)
13908 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13919 auto FirstMask = FirstElt->PermMask;
13920 auto SecondMask = SecondElt->PermMask;
13930 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13932 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13947 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13951 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13964 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13981 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14043 EVT VT = N->getValueType(0);
14045 SDValue LHS = N->getOperand(0);
14046 SDValue RHS = N->getOperand(1);
14049 if (Subtarget->hasMad64_32()) {
14059 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14060 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14070 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14071 if (MulIdx == -1)
14073 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14076 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14081 TempNode->getOperand(MulIdx), *Src0, *Src1,
14082 TempNode->getOperand(MulIdx)->getOperand(0),
14083 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14091 auto AddIdx = 1 - MulIdx;
14092 // Allow the special case where add (add (mul24, 0), mul24) became ->
14094 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14095 Src2s.push_back(TempNode->getOperand(AddIdx));
14097 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14101 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14105 TempNode->getOperand(AddIdx), *Src0, *Src1,
14106 TempNode->getOperand(AddIdx)->getOperand(0),
14107 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14119 TempNode = TempNode->getOperand(AddIdx);
14122 if (TempNode->getNumOperands() < 2)
14124 LHS = TempNode->getOperand(0);
14125 RHS = TempNode->getOperand(1);
14146 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14147 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14148 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14150 auto Src0Mask = Src0s.begin()->PermMask;
14154 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14168 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14171 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14172 SecondElt->DWordOffset);
14188 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14232 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14241 EVT VT = N->getValueType(0);
14247 SDValue LHS = N->getOperand(0);
14248 SDValue RHS = N->getOperand(1);
14275 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14283 if (N->getValueType(0) != MVT::i32)
14286 if (!isNullConstant(N->getOperand(1)))
14290 SDValue LHS = N->getOperand(0);
14295 unsigned Opc = N->getOpcode();
14298 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14299 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14310 EVT VT = N->getValueType(0);
14313 SDValue LHS = N->getOperand(0);
14314 SDValue RHS = N->getOperand(1);
14319 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14331 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14353 EVT VT = N->getValueType(0);
14361 SDValue LHS = N->getOperand(0);
14362 SDValue RHS = N->getOperand(1);
14364 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14378 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14384 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14397 EVT VT = N->getValueType(0);
14398 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14401 SDValue LHS = N->getOperand(0);
14402 SDValue RHS = N->getOperand(1);
14404 SDNodeFlags Flags = N->getFlags();
14405 SDNodeFlags RHSFlags = RHS->getFlags();
14407 !RHS->hasOneUse())
14412 if (CLHS->isExactlyValue(1.0) ||
14413 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14414 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14415 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14431 EVT VT = N->getValueType(0);
14434 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14437 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14439 SDValue Op1 = N->getOperand(0);
14440 SDValue Op2 = N->getOperand(1);
14441 SDValue FMA = N->getOperand(2);
14450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14453 (N->getFlags().hasAllowContract() &&
14454 FMA->getFlags().hasAllowContract())) {
14508 SDValue LHS = N->getOperand(0);
14509 SDValue RHS = N->getOperand(1);
14511 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14525 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14526 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14527 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
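// Editor's note (annotation, not part of SIISelLowering.cpp): sext from i1
// only produces 0 or -1 (all ones), so comparing it against those constants
// reduces to cc or its inverse. For example, cc = 1 gives sext cc = -1, hence
// "(sext cc) eq -1" is cc itself, while "(sext cc) eq 0" and "(sext cc) ne -1"
// are both "not cc", which the folds below emit as xor cc, -1.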
14529 if ((CRHS->isAllOnes() &&
14531 (CRHS->isZero() &&
14534 DAG.getConstant(-1, SL, MVT::i1));
14535 if ((CRHS->isAllOnes() &&
14537 (CRHS->isZero() &&
14542 const APInt &CRHSVal = CRHS->getAPIntValue();
14550 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14552 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14560 DAG.getConstant(-1, SL, MVT::i1));
14568 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14572 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14573 // (fcmp one (fabs x), inf) -> (fp_class x,
14580 const APFloat &APF = CRHS->getValueAPF();
14603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14605 SDValue Src = N->getOperand(0);
14606 SDValue Shift = N->getOperand(0);
14613 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14614 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14615 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14616 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14617 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
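// Editor's note (annotation, not part of SIISelLowering.cpp): cvt_f32_ubyteN
// reads byte N of its 32-bit source, so a constant byte-sized shift simply
// renames which byte is read. Checking two of the rules with an illustrative
// helper:
constexpr unsigned byteN(unsigned X, unsigned N) {
  return (X >> (8 * N)) & 0xFFu;
}
static_assert(byteN(0xAABBCCDDu << 8, 1) == byteN(0xAABBCCDDu, 0),
              "cvt_f32_ubyte1 (shl x, 8) reads the same byte as cvt_f32_ubyte0 x");
static_assert(byteN(0xAABBCCDDu >> 16, 0) == byteN(0xAABBCCDDu, 2),
              "cvt_f32_ubyte0 (srl x, 16) reads the same byte as cvt_f32_ubyte2 x");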
14624 ShiftOffset -= C->getZExtValue();
14626 ShiftOffset += C->getZExtValue();
14640 if (N->getOpcode() != ISD::DELETED_NODE)
14648 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14655 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14660 const APFloat &F = CSrc->getValueAPF();
14663 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14664 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14669 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14679 switch (N->getOpcode()) {
14715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14716 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14717 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14741 SDValue Src = N->getOperand(0);
14764 EVT VT = N->getValueType(0);
14766 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14769 SDValue Src = N->getOperand(0);
14819 unsigned Opcode = Node->getMachineOpcode();
14822 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14823 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14828 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14829 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14831 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14832 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14833 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14834 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14838 bool HasChain = Node->getNumValues() > 1;
14852 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14860 if (!I->isMachineOpcode() ||
14861 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14868 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14899 // If the original dmask has one channel, then there is nothing to do
14902 // Use an arbitrary dmask - required for the instruction to work
14911 // Check for TFE or LWE - increase the number of channels by one to account
14918 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14919 assert(NewOpcode != -1 &&
14920 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14925 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14927 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14929 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14943 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14948 assert(Node->hasNUsesOfValue(1, 0));
14950 SDLoc(Node), Users[Lane]->getValueType(0),
14998 if (Node->getOpcode() == ISD::CopyToReg) {
14999 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15000 SDValue SrcVal = Node->getOperand(2);
15004 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15010 SDNode *Glued = Node->getGluedNode();
15012 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15013 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15024 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15025 if (!isFrameIndexOp(Node->getOperand(i))) {
15026 Ops.push_back(Node->getOperand(i));
15032 Node->getOperand(i).getValueType(),
15033 Node->getOperand(i)), 0));
15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15044 unsigned Opcode = Node->getMachineOpcode();
15046 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15047 !TII->isGather4(Opcode) &&
15064 SDValue Src0 = Node->getOperand(1);
15065 SDValue Src1 = Node->getOperand(3);
15066 SDValue Src2 = Node->getOperand(5);
15075 getRegClassFor(VT, Src0.getNode()->isDivergent());
15101 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15106 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15121 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15122 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15129 if (TII->isImage(MI)) {
15130 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15131 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15132 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15137 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15138 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15139 unsigned D16Val = D16 ? D16->getImm() : 0;
15144 // At least one of TFE or LWE is non-zero
15149 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15154 unsigned dmask = MO_Dmask->getImm();
15157 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15159 bool Packed = !Subtarget->hasUnpackedD16VMem();
15164 // - this is in fact an error but this is picked up elsewhere and
15167 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15170 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15171 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15185 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15186 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15189 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15190 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15193 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15195 // Insert into the super-reg
15196 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15208 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15217 MachineFunction *MF = MI.getParent()->getParent();
15218 MachineRegisterInfo &MRI = MF->getRegInfo();
15219 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15221 if (TII->isVOP3(MI.getOpcode())) {
15223 TII->legalizeOperandsVOP3(MRI, MI);
15226 // This saves a chain-copy of registers and better balances register
15230 bool HasAGPRs = Info->mayNeedAGPRs();
15231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15236 if (I == -1)
15243 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15244 if (!TRI->hasAGPRs(RC))
15247 if (!Src || !Src->isCopy() ||
15248 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15250 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15261 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15262 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15263 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15264 if (TRI->isVectorSuperClass(RC)) {
15265 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15266 MRI.setRegClass(Src2->getReg(), NewRC);
15267 if (Src2->isTied())
15277 if (TII->isImage(MI))
15278 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15293 // full 128-bit register. If we are building multiple resource descriptors,
15294 // this will allow CSEing of the 2-component register.
15299 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15352 //===----------------------------------------------------------------------===//
15354 //===----------------------------------------------------------------------===//
15390 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15397 if (!Subtarget->hasMAIInsts())
15404 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15419 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15437 uint32_t Width = (End - Idx + 1) * 32;
15438 MCRegister Reg = RC->getRegister(Idx);
15440 RC = TRI->getVGPRClassForBitWidth(Width);
15442 RC = TRI->getSGPRClassForBitWidth(Width);
15444 RC = TRI->getAGPRClassForBitWidth(Width);
15446 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15452 if (!Failed && Idx < RC->getNumRegs())
15453 return std::pair(RC->getRegister(Idx), RC);
15460 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15528 if (Size == 16 && !Subtarget->has16BitInsts())
15532 Val = C->getSExtValue();
15536 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15544 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15545 Val = C->getSExtValue();
15548 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15549 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15592 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15663 return -1;
15668 // the function is legalized do we know all of the non-spill stack objects or if
15674 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15677 if (Info->isEntryFunction()) {
15686 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15687 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15689 Info->setSGPRForEXECCopy(SReg);
15691 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15692 Info->getStackPtrOffsetReg()));
15693 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15694 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15698 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15699 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15701 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15702 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15704 Info->limitOccupancy(MF);
15709 TII->fixImplicitOperands(MI);
15716 // per-subtarget, but there's no easy way to achieve that right now. This is
15725 int NewClassID = getAlignedAGPRClassID(RC->getID());
15726 if (NewClassID != -1)
15727 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15749 // These return at most the (wavefront size - 1) + src1
15758 Known.Zero.setHighBits(Size - MaxActiveBits);
15776 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15790 switch (MI->getOpcode()) {
15793 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15805 // These return at most the wavefront size - 1.
15807 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15815 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15829 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15861 Intrinsic::ID IID = GI->getIntrinsicID();
15874 // Pre-GFX10 targets did not benefit from loop alignment
15875 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15876 getSubtarget()->hasInstFwdPrefetchBug())
15889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15890 const MachineBasicBlock *Header = ML->getHeader();
15891 if (Header->getAlignment() != PrefAlign)
15892 return Header->getAlignment(); // Already processed.
15895 for (const MachineBasicBlock *MBB : ML->blocks()) {
15899 LoopSize += MBB->getAlignment().value() / 2;
15902 LoopSize += TII->getInstSizeInBytes(MI);
15916 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15917 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15918 auto I = Exit->getFirstNonDebugInstr();
15919 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15924 MachineBasicBlock *Pre = ML->getLoopPreheader();
15925 MachineBasicBlock *Exit = ML->getExitBlock();
15928 auto PreTerm = Pre->getFirstTerminator();
15929 if (PreTerm == Pre->begin() ||
15930 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15931 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15934 auto ExitHead = Exit->getFirstNonDebugInstr();
15935 if (ExitHead == Exit->end() ||
15936 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15937 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15946 assert(N->getOpcode() == ISD::CopyFromReg);
15949 N = N->getOperand(0).getNode();
15950 if (N->getOpcode() == ISD::INLINEASM ||
15951 N->getOpcode() == ISD::INLINEASM_BR)
15953 } while (N->getOpcode() == ISD::CopyFromReg);
15960 switch (N->getOpcode()) {
15962 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15963 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15965 Register Reg = R->getReg();
15969 return !TRI->isSGPRReg(MRI, Reg);
15971 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15972 return UA->isDivergent(V);
15974 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15975 return !TRI->isSGPRReg(MRI, Reg);
15979 unsigned AS = L->getAddressSpace();
15986 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15988 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16007 // Target-specific read-modify-write atomics are sources of divergence.
16011 // Generic read-modify-write atomics are sources of divergence.
16012 return A->readMem() && A->writeMem();
16052 if (Info->getMode().DX10Clamp)
16066 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16074 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16084 LLVMContext &Ctx = RMW->getContext();
16087 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16089 : SSNs[RMW->getSyncScopeID()];
16093 << RMW->getOperationName(RMW->getOperation())
16099 Type *EltTy = VT->getElementType();
16100 return VT->getNumElements() == 2 &&
16101 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16109 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16114 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16119 unsigned AS = RMW->getPointerAddressSpace();
16124 OptimizationRemarkEmitter ORE(RMW->getFunction());
16131 auto SSID = RMW->getSyncScopeID();
16134 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16136 switch (RMW->getOperation()) {
16143 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16144 ConstVal && ConstVal->isNullValue())
16151 Type *Ty = RMW->getType();
16156 // is fixed to round-to-nearest-even.
16159 // round-to-nearest-even.
16162 // suggests it is OK if the floating-point mode may not match the calling
16164 if (Ty->isFloatTy()) {
16165 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16169 if (Ty->isDoubleTy()) {
16171 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16175 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16185 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16190 // FIXME: Needs to account for no fine-grained memory
16191 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16195 // FIXME: Needs to account for no fine-grained memory
16196 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16200 // FIXME: Needs to account for no fine-grained memory
16201 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16205 // FIXME: Needs to account for no fine-grained memory
16206 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16211 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16215 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16223 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16227 if (Ty->isFloatTy()) {
16228 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16229 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16232 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16236 if (RMW->use_empty() &&
16237 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16243 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244 if (Subtarget->hasFlatAtomicFaddF32Inst())
16252 if (Subtarget->hasLDSFPAtomicAddF32()) {
16253 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16255 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16264 Type *Ty = RMW->getType();
16267 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16270 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16288 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16290 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16294 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16296 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16323 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16330 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16337 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16347 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16349 if (!TRI->isSGPRClass(RC) && !isDivergent)
16350 return TRI->getEquivalentSGPRClass(RC);
16351 if (TRI->isSGPRClass(RC) && isDivergent)
16352 return TRI->getEquivalentVGPRClass(RC);
16362 // FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
16368 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16369 if (!IT || IT->getBitWidth() != WaveSize)
16377 for (const auto *U : V->users()) {
16379 if (V == U->getOperand(1)) {
16380 switch (Intrinsic->getIntrinsicID()) {
16391 if (V == U->getOperand(0)) {
16392 switch (Intrinsic->getIntrinsicID()) {
16414 if (CI->isInlineAsm()) {
16420 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16422 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16428 if (RC && SIRI->isSGPRClass(RC))
16435 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16439 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16454 if (N0->isDivergent() || !N1->isDivergent())
16459 hasMemSDNodeUser(*N0->use_begin()));
16481 if (User->getOpcode() != ISD::CopyToReg)
16483 if (!Def->isMachineOpcode())
16489 unsigned ResNo = User->getOperand(Op).getResNo();
16490 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16492 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16496 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16497 Cost = RC->getCopyCost();
16504 AtomicRMWInst::BinOp Op = AI->getOperation();
16508 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16509 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16511 AI->setOperation(AtomicRMWInst::Add);
16515 assert(Subtarget->hasAtomicFaddInsts() &&
16517 assert(AI->getType()->isFloatTy() &&
16518 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16569 Function *F = BB->getParent();
16571 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16582 Value *Val = AI->getValOperand();
16583 Type *ValTy = Val->getType();
16584 Value *Addr = AI->getPointerOperand();
16587 Value *Val) -> Value * {
16589 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16590 AI->getOrdering(), AI->getSyncScopeID());
16592 AI->getAllMetadata(MDs);
16594 OldVal->setMetadata(P.first, P.second);
16598 std::prev(BB->end())->eraseFromParent();
16635 Loaded->addIncoming(LoadedShared, SharedBB);
16636 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16637 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16640 AI->replaceAllUsesWith(Loaded);
16641 AI->eraseFromParent();
16647 auto Order = AI->getOrdering();
16657 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16658 LI->setAtomic(Order, AI->getSyncScopeID());
16659 LI->copyMetadata(*AI);
16660 LI->takeName(AI);
16661 AI->replaceAllUsesWith(LI);
16662 AI->eraseFromParent();