Lines Matching +full:nvptx +full:- +full:-

1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines an instruction selector for the NVPTX target.
11 //===----------------------------------------------------------------------===//
14 #include "NVPTX.h"
31 #define DEBUG_TYPE "nvptx-isel"
32 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
35 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
38 /// createNVPTXISelDag - This pass converts a legalized DAG into a
39 /// NVPTX-specific DAG, ready for instruction scheduling.
67 return Subtarget->getTargetLowering()->getDivF32Level();
71 return Subtarget->getTargetLowering()->usePrecSqrtF32();
75 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
79 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
80 return TL->allowFMA(*MF, OptLevel);
84 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
85 return TL->allowUnsafeFPMath(*MF);
90 /// Select - Select instructions not customized! Used for
94 if (N->isMachineOpcode()) {
95 N->setNodeId(-1);
99 switch (N->getOpcode()) {
182 if (N->getOperand(1).getValueType() == MVT::i128) {
189 if (N->getOperand(1).getValueType() == MVT::i128) {
208 unsigned IID = N->getConstantOperandVal(1);
222 using NVPTX::PTXCmpMode::CmpMode;
271 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
278 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
280 SDNode *SetP = CurDAG->getMachineNode(
281 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
282 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
289 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
291 SDNode *SetP = CurDAG->getMachineNode(
292 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
293 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
301 SDValue Vector = N->getOperand(0);
310 for (auto *U : Vector.getNode()->users()) {
311 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
313 if (U->getOperand(0) != Vector)
316 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
317 if (IdxConst->getZExtValue() == 0)
319 else if (IdxConst->getZExtValue() == 1)
335 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
345 const Value *Src = N->getMemOperand()->getValue();
348 return NVPTX::AddressSpace::Generic;
350 if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
351 switch (PT->getAddressSpace()) {
353 return NVPTX::AddressSpace::Local;
355 return NVPTX::AddressSpace::Global;
357 return NVPTX::AddressSpace::Shared;
359 return NVPTX::AddressSpace::Generic;
361 return NVPTX::AddressSpace::Param;
363 return NVPTX::AddressSpace::Const;
367 return NVPTX::AddressSpace::Generic;
373 NVPTX::Ordering InstructionOrdering, FenceOrdering;
374 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
375 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
381 AtomicOrdering Ordering = N->getSuccessOrdering();
384 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
385 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
387 // clang-format off
393 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
394 // |---------|----------|--------------------|------------|------------------------------|
407 // | | | | | or .volatile (PTX 8.1-) |
418 // |------------------------------------------------------|-------------------------------|
421 // |------------------------------------------------------|-------------------------------|
424 // |------------------------------------------------------|-------------------------------|
427 // |------------------------------------------------------|-------------------------------|
431 // clang-format on
440 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
441 // PTX instructions fails to preserve their C++ side-effects.
443 // Example (https://github.com/llvm/llvm-project/issues/62057):
450 // A C++ program that calls "example" is well-defined: the infinite loop
461 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
463 // to weak memory operations in sm_60- is therefore fine.
467 // - the "weak" memory instruction we are currently lowering to, and
468 // - some other instruction that preserves the side-effect, e.g.,
470 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
471 CodeAddrSpace == NVPTX::AddressSpace::Const ||
472 CodeAddrSpace == NVPTX::AddressSpace::Param) {
473 return NVPTX::Ordering::NotAtomic;
497 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
498 CodeAddrSpace == NVPTX::AddressSpace::Global ||
499 CodeAddrSpace == NVPTX::AddressSpace::Shared);
501 return NVPTX::Ordering::NotAtomic;
504 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
508 return N->isVolatile() ? NVPTX::Ordering::Volatile
509 : NVPTX::Ordering::NotAtomic;
514 if (N->isVolatile())
515 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
516 : NVPTX::Ordering::Volatile;
518 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
519 : NVPTX::Ordering::Volatile;
523 if (!N->readMem())
526 N->getOperationName()));
527 return NVPTX::Ordering::Acquire;
529 if (!N->writeMem())
532 N->getOperationName()));
533 return NVPTX::Ordering::Release;
536 formatv("NVPTX does not support AcquireRelease Ordering on "
537 "read-modify-write "
539 N->getOperationName()));
542 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
545 // whether the memory operation is a read, write, or read-modify-write.
549 NVPTX::Ordering InstrOrder;
550 if (N->readMem())
551 InstrOrder = NVPTX::Ordering::Acquire;
552 else if (N->writeMem())
553 InstrOrder = NVPTX::Ordering::Release;
556 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
557 "read-modify-writes yet: {}",
558 N->getOperationName()));
560 NVPTX::Ordering::SequentiallyConsistent);
564 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
570 NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
571 NVPTX::Ordering O) const {
573 case NVPTX::Ordering::NotAtomic:
574 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
575 // NVPTX uses Thread scope as the scope of non-atomic operations.
576 return NVPTX::Scope::Thread;
577 case NVPTX::Ordering::RelaxedMMIO:
581 return NVPTX::Scope::System;
582 case NVPTX::Ordering::Relaxed:
583 case NVPTX::Ordering::Acquire:
584 case NVPTX::Ordering::Release:
585 case NVPTX::Ordering::AcquireRelease:
586 case NVPTX::Ordering::SequentiallyConsistent:
587 auto S = Scopes[N->getSyncScopeID()];
590 if (S == NVPTX::Scope::Thread)
595 if (S == NVPTX::Scope::Cluster)
596 Subtarget->failIfClustersUnsupported("cluster scope");
599 return N->isVolatile() ? NVPTX::Scope::System : S;
613 // - constant global variables, and
614 // - kernel function pointer params that are noalias (i.e. __restrict) and
620 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
623 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
626 if (N->isInvariant())
629 bool IsKernelFn = isKernelFunction(F->getFunction());
635 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
639 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
641 return GV->isConstant();
646 static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
648 if (S == NVPTX::Scope::Cluster)
649 T->failIfClustersUnsupported(".cluster scope fence");
652 case NVPTX::Ordering::Acquire:
653 case NVPTX::Ordering::Release:
654 case NVPTX::Ordering::AcquireRelease: {
656 case NVPTX::Scope::System:
657 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
658 : NVPTX::INT_MEMBAR_SYS;
659 case NVPTX::Scope::Block:
660 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
661 : NVPTX::INT_MEMBAR_CTA;
662 case NVPTX::Scope::Cluster:
663 return NVPTX::atomic_thread_fence_acq_rel_cluster;
664 case NVPTX::Scope::Device:
665 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
666 : NVPTX::INT_MEMBAR_GL;
667 case NVPTX::Scope::Thread:
674 case NVPTX::Ordering::SequentiallyConsistent: {
676 case NVPTX::Scope::System:
677 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
678 : NVPTX::INT_MEMBAR_SYS;
679 case NVPTX::Scope::Block:
680 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
681 : NVPTX::INT_MEMBAR_CTA;
682 case NVPTX::Scope::Cluster:
683 return NVPTX::atomic_thread_fence_seq_cst_cluster;
684 case NVPTX::Scope::Device:
685 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
686 : NVPTX::INT_MEMBAR_GL;
687 case NVPTX::Scope::Thread:
693 case NVPTX::Ordering::NotAtomic:
694 case NVPTX::Ordering::Relaxed:
695 case NVPTX::Ordering::Volatile:
696 case NVPTX::Ordering::RelaxedMMIO:
707 std::pair<NVPTX::Ordering, NVPTX::Scope>
715 switch (NVPTX::Ordering(FenceOrdering)) {
716 case NVPTX::Ordering::NotAtomic:
718 case NVPTX::Ordering::SequentiallyConsistent: {
720 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
726 OrderingToString(NVPTX::Ordering(FenceOrdering))));
732 unsigned IID = N->getConstantOperandVal(0);
744 SDValue Wrapper = N->getOperand(1);
746 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
751 SDValue Src = N->getOperand(0);
753 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
754 unsigned DstAddrSpace = CastN->getDestAddressSpace();
764 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
765 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
774 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
777 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
780 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
783 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
786 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
791 report_fatal_error("Cannot cast between two non-generic address spaces");
796 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
799 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
802 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
805 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
808 Opc = TM.is64Bit() ? NVPTX::IMOV64rr : NVPTX::IMOV32rr;
812 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
815 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
816 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
866 return NVPTX::PTXLdStInstCode::Untyped;
868 return NVPTX::PTXLdStInstCode::Float;
871 return NVPTX::PTXLdStInstCode::Unsigned;
876 assert(LD->readMem() && "Expected load");
880 if (PlainLoad && PlainLoad->isIndexed())
883 EVT LoadedVT = LD->getMemoryVT();
893 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
896 SDValue Chain = N->getOperand(0);
907 // Read at least 8 bits (predicates are stored as 8-bit values)
912 unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
920 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
921 FromType = NVPTX::PTXLdStInstCode::Signed;
926 SDValue N1 = N->getOperand(1);
930 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
938 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
939 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
940 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
946 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
947 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
948 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
956 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
957 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
958 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
960 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
961 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
962 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
969 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
970 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
971 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
973 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
974 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
975 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
982 CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
986 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
987 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
994 // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
999 // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
1006 EVT LoadedVT = MemSD->getMemoryVT();
1016 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1019 SDValue Chain = N->getOperand(0);
1032 // Read at least 8 bits (predicates are stored as 8-bit values)
1037 N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1039 FromType = NVPTX::PTXLdStInstCode::Signed;
1045 switch (N->getOpcode()) {
1047 VecType = NVPTX::PTXLdStInstCode::V2;
1050 VecType = NVPTX::PTXLdStInstCode::V4;
1056 EVT EltVT = N->getValueType(0);
1060 FromType = NVPTX::PTXLdStInstCode::Untyped;
1064 SDValue Op1 = N->getOperand(1);
1075 switch (N->getOpcode()) {
1080 NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1081 NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1082 NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1086 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1087 NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
1088 std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
1097 switch (N->getOpcode()) {
1102 NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1103 NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1104 NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1108 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1109 NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
1110 std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
1120 switch (N->getOpcode()) {
1126 NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
1127 NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
1128 NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
1132 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1133 NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
1134 NVPTX::LDV_f32_v4_ari_64, std::nullopt);
1138 switch (N->getOpcode()) {
1143 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1144 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1145 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1149 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1150 NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
1151 std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
1160 switch (N->getOpcode()) {
1165 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1166 NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1167 NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1168 NVPTX::LDV_f64_v2_areg_64);
1172 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1173 NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
1174 NVPTX::LDV_f32_v4_areg_64, std::nullopt);
1178 switch (N->getOpcode()) {
1183 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1184 NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1185 NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
1186 NVPTX::LDV_f64_v2_areg);
1190 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1191 NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
1192 std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
1200 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1202 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1203 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1214 SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1216 EVT OrigType = N->getValueType(0);
1217 EVT EltVT = Mem->getMemoryVT();
1237 // 8-bit registers in NVPTX.
1244 SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1245 SDValue Chain = N->getOperand(0);
1253 switch (N->getOpcode()) {
1258 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1259 NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1260 NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1261 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1265 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1266 NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1267 NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1268 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1272 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1273 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1274 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1275 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1276 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1277 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1281 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1282 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1283 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1284 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1285 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1286 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1290 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1291 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1292 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1293 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1297 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1298 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1299 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1300 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1306 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1310 switch (N->getOpcode()) {
1315 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1316 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1317 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1318 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1319 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1320 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1324 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1325 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1326 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1327 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1328 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1329 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1333 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1334 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1335 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1336 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1337 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1338 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1342 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1343 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1344 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1345 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1346 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1347 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1351 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1352 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1353 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1354 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1358 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1359 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1360 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1361 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1365 switch (N->getOpcode()) {
1370 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1371 NVPTX::INT_PTX_LDG_GLOBAL_i16ari, NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1372 NVPTX::INT_PTX_LDG_GLOBAL_i64ari, NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1373 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1377 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1378 NVPTX::INT_PTX_LDU_GLOBAL_i16ari, NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1379 NVPTX::INT_PTX_LDU_GLOBAL_i64ari, NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1380 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1384 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1385 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1386 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1387 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1388 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1389 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1393 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1394 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1395 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1396 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1397 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1398 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1402 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1403 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1404 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1405 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1409 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1410 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1411 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1412 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1419 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1422 switch (N->getOpcode()) {
1427 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1428 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1429 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1430 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1431 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1432 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1436 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1437 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1438 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1439 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1440 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1441 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1445 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1446 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1447 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1448 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1449 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1450 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1454 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1455 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1456 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1457 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1458 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1459 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1463 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1464 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1465 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1466 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1470 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1471 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1472 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1473 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1477 switch (N->getOpcode()) {
1482 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1483 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1484 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1485 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1486 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1487 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1491 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1492 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1493 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1494 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1495 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1496 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1500 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1501 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1502 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1503 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1504 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1505 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1509 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1510 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1511 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1512 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1513 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1514 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1518 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1519 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1520 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1521 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1525 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1526 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1527 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1528 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1535 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1546 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1553 // We have an extending-load. The instruction we selected operates on the
1559 // For each output value, apply the manual sign/zero-extension and make sure
1566 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1567 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1579 assert(ST->writeMem() && "Expected store");
1585 if (PlainStore && PlainStore->isIndexed())
1588 EVT StoreVT = ST->getMemoryVT();
1595 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1598 SDValue Chain = ST->getChain();
1603 unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
1606 // - for integer type, always use 'u'
1619 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1620 SDValue BasePtr = ST->getBasePtr();
1625 Value.getNode()->getSimpleValueType(0).SimpleTy;
1633 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1634 NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1635 NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1642 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1643 NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1644 NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1653 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1654 NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1655 NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1657 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1658 NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1659 NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1666 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1667 NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1668 NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1670 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1671 NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1672 NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1678 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1683 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1684 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1690 SDValue Op1 = N->getOperand(1);
1696 EVT StoreVT = MemSD->getMemoryVT();
1700 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1705 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1708 SDValue Chain = N->getOperand(0);
1712 // - for integer type, always use 'u'
1722 switch (N->getOpcode()) {
1724 VecType = NVPTX::PTXLdStInstCode::V2;
1725 Ops.append({N->getOperand(1), N->getOperand(2)});
1726 N2 = N->getOperand(3);
1729 VecType = NVPTX::PTXLdStInstCode::V4;
1730 Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
1731 N->getOperand(4)});
1732 N2 = N->getOperand(5);
1740 ToType = NVPTX::PTXLdStInstCode::Untyped;
1749 switch (N->getOpcode()) {
1754 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1755 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1756 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1760 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1761 NVPTX::STV_i32_v4_avar, std::nullopt,
1762 NVPTX::STV_f32_v4_avar, std::nullopt);
1768 switch (N->getOpcode()) {
1773 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1774 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1775 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1779 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1780 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1781 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1788 switch (N->getOpcode()) {
1794 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1795 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1796 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1800 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1801 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1802 NVPTX::STV_f32_v4_ari_64, std::nullopt);
1806 switch (N->getOpcode()) {
1811 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1812 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1813 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1817 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1818 NVPTX::STV_i32_v4_ari, std::nullopt,
1819 NVPTX::STV_f32_v4_ari, std::nullopt);
1826 switch (N->getOpcode()) {
1831 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
1832 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
1833 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
1834 NVPTX::STV_f64_v2_areg_64);
1838 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
1839 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
1840 NVPTX::STV_f32_v4_areg_64, std::nullopt);
1844 switch (N->getOpcode()) {
1849 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
1850 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
1851 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
1852 NVPTX::STV_f64_v2_areg);
1856 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
1857 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
1858 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
1870 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1872 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1873 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
1880 SDValue Chain = Node->getOperand(0);
1881 SDValue Offset = Node->getOperand(2);
1882 SDValue Glue = Node->getOperand(3);
1887 switch (Node->getOpcode()) {
1901 EVT EltVT = Node->getValueType(0);
1902 EVT MemVT = Mem->getMemoryVT();
1911 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
1912 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
1913 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
1917 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
1918 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
1919 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
1920 NVPTX::LoadParamMemV2F64);
1924 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
1925 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
1926 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
1934 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
1936 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
1939 VTs = CurDAG->getVTList(EVTs);
1942 unsigned OffsetVal = Offset->getAsZExtVal();
1945 {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
1947 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
1953 SDValue Chain = N->getOperand(0);
1954 SDValue Offset = N->getOperand(1);
1955 unsigned OffsetVal = Offset->getAsZExtVal();
1960 switch (N->getOpcode()) {
1977 Ops.push_back(N->getOperand(i + 2));
1978 Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain});
1981 // If we have an i1, use an 8-bit store. The lowering code in
1988 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
1989 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
1990 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
1991 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
1992 if (Opcode == NVPTX::StoreRetvalI8) {
2000 Opcode = NVPTX::StoreRetvalI8TruncI32;
2003 Opcode = NVPTX::StoreRetvalI8TruncI64;
2009 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2010 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2011 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2012 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2015 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2016 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2017 NVPTX::StoreRetvalV4I32, std::nullopt,
2018 NVPTX::StoreRetvalV4F32, std::nullopt);
2024 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2025 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2026 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2032 // Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
2034 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
2043 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
2078 const ConstantFP *CF = ConstImm->getConstantFPValue();
2079 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2082 const ConstantInt *CI = ConstImm->getConstantIntValue();
2083 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2109 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2110 : NVPTX::StoreParamV4I8_rrrr;
2113 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2114 : NVPTX::StoreParamV4I16_rrrr;
2119 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2120 : NVPTX::StoreParamV4I32_rrrr;
2128 SDValue Chain = N->getOperand(0);
2129 SDValue Param = N->getOperand(1);
2130 unsigned ParamVal = Param->getAsZExtVal();
2131 SDValue Offset = N->getOperand(2);
2132 unsigned OffsetVal = Offset->getAsZExtVal();
2134 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2138 switch (N->getOpcode()) {
2157 Ops.push_back(N->getOperand(i + 3));
2158 Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
2159 CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
2162 // If we have an i1, use an 8-bit store. The lowering code in
2165 switch (N->getOpcode()) {
2171 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2178 const ConstantFP *CF = ConstImm->getConstantFPValue();
2179 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2182 const ConstantInt *CI = ConstImm->getConstantIntValue();
2183 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2187 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2188 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2189 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2190 NVPTX::StoreParamF64_i);
2193 pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2194 NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2195 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2196 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2197 if (Opcode == NVPTX::StoreParamI8_r) {
2205 Opcode = NVPTX::StoreParamI8TruncI32_r;
2208 Opcode = NVPTX::StoreParamI8TruncI64_r;
2216 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2222 // Special case: if we have a sign-extend/zero-extend node, insert the
2226 Opcode = NVPTX::StoreParamI32_r;
2227 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2229 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2235 Opcode = NVPTX::StoreParamI32_r;
2236 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2238 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2245 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2246 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2247 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2248 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2254 /// SelectBFE - Look for instruction sequences that can be made more efficient
2255 /// by using the 'bfe' (bit-field extract) PTX instruction
2258 SDValue LHS = N->getOperand(0);
2259 SDValue RHS = N->getOperand(1);
2265 if (N->getOpcode() == ISD::AND) {
2279 uint64_t MaskVal = Mask->getZExtValue();
2282 // 'and' operation to fix up the low-order bits so we would trade
2289 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2293 Val = LHS.getNode()->getOperand(0);
2294 Start = LHS.getNode()->getOperand(1);
2297 uint64_t StartVal = StartConst->getZExtValue();
2300 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
2307 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
2311 // require run-time logic that would be more expensive than just
2318 // 'and' -> 'bfe', but 'and' has higher-throughput.
2321 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
2322 if (LHS->getOpcode() == ISD::AND) {
2329 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
2331 SDValue AndLHS = LHS->getOperand(0);
2332 SDValue AndRHS = LHS->getOperand(1);
2345 uint64_t MaskVal = MaskCnst->getZExtValue();
2352 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
2359 NumBits = NumZeros + NumOnes - ShiftAmt;
2367 // transformation non-profitable
2372 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
2373 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2374 } else if (LHS->getOpcode() == ISD::SHL) {
2382 Val = LHS->getOperand(0);
2384 SDValue ShlRHS = LHS->getOperand(1);
2390 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
2398 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
2412 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
2414 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
2417 if (N->getOpcode() == ISD::SRA) {
2437 Opc = NVPTX::BFE_S32rii;
2439 Opc = NVPTX::BFE_U32rii;
2443 Opc = NVPTX::BFE_S64rii;
2445 Opc = NVPTX::BFE_U64rii;
2456 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
2467 if (STI->hasNativeBF16Support(N->getOpcode()))
2473 SDValue N0 = N->getOperand(0);
2474 SDValue N1 = N->getOperand(1);
2476 auto GetConstant = [&](float Value) -> SDValue {
2485 auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
2486 return SDValue(CurDAG->getMachineNode(NVPTX::IMOV32ri, DL, VT, Const), 0);
2488 auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
2489 return SDValue(CurDAG->getMachineNode(NVPTX::BFMOV16ri, DL, VT, Const), 0);
2492 switch (N->getOpcode()) {
2494 // add(a, b) -> fma(a, 1.0, b)
2498 // sub(a, b) -> fma(b, -1.0, a)
2499 Operands = {N1, GetConstant(-1.0), N0};
2502 // mul(a, b) -> fma(a, b, -0.0)
2503 // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats
2504 Operands = {N0, N1, GetConstant(-0.0)};
2510 int Opcode = IsVec ? NVPTX::BFMA16x2rrr : NVPTX::BFMA16rrr;
2511 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
2518 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
2521 // SelectDirectAddr - Match a direct address for DAG.
2534 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
2536 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
2537 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
2538 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
2539 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
2551 uint64_t AccumulatedOffset) -> std::optional<uint64_t> {
2555 AccumulatedOffset += CN->getZExtValue();
2565 Offset = CurDAG->getTargetConstant(*AccumulatedOffset, SDLoc(OpNode), VT);
2588 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
2589 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), VT);
2604 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
2608 // Offset must fit in a 32-bit signed int in PTX [register+offset] address
2610 if (!CN->getAPIntValue().isSignedIntN(32))
2613 Offset = CurDAG->getSignedTargetConstant(CN->getSExtValue(),
2637 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
2639 Src = mN->getMemOperand()->getValue();
2643 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
2644 return (PT->getAddressSpace() == spN);
2648 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
2660 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
2674 // Lower a CopyToReg with two 64-bit inputs
2683 SDValue Dst = N->getOperand(1);
2684 SDValue Lo = N->getOperand(2);
2685 SDValue Hi = N->getOperand(3);
2689 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
2691 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
2692 NewOps[0] = N->getOperand(0);
2695 if (N->getNumOperands() == 5)
2696 NewOps[3] = N->getOperand(4);
2697 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
2703 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
2712 SDValue Ch = N->getOperand(0);
2713 SDValue Src = N->getOperand(1);
2714 SDValue Glue = N->getOperand(2);
2719 SDNode *Mov = CurDAG->getMachineNode(
2720 NVPTX::I128toV2I64, DL,
2727 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
2731 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
2740 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
2742 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
2744 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
2751 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
2753 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
2755 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
2762 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
2764 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
2766 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
2773 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
2775 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
2777 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
2784 return NVPTX::CVT_f32_f16;
2786 return NVPTX::CVT_f64_f16;
2793 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
2795 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
2796 Scopes[N->getConstantOperandVal(2)], Subtarget);
2797 SDValue Chain = N->getOperand(0);
2798 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
2804 Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
2805 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
2806 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
2807 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
2808 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
2811 NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
2813 llvm_unreachable("NVPTX Scopes must be initialized before calling "
2819 // - Add API to LLVMContext to get the name of a single scope.
2820 // - Use that API here to print an error containing the name
2824 return S->second;
2831 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
2832 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
2846 [&]() -> auto { \
2857 (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2858 : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
 2995 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2996 // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
3001 size_t NumOps = N->getNumOperands();
3002 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
3003 : (NumOps - 9);
3004 // Offsets is always 'NumDims - 2' and only for im2col mode
3005 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
3006 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3007 bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
3012 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
3016 Ops.push_back(N->getOperand(MultiCastIdx));
3020 Ops.push_back(N->getOperand(MultiCastIdx + 1));
3023 Ops.push_back(N->getOperand(0));
3026 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3029 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 3034 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3038 size_t NumOps = N->getNumOperands();
3039 size_t NumDims = NumOps - 6;
3040 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3044 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
3045 Ops.push_back(N->getOperand(0)); // Chain operand
3048 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3051 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 3056 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3057 // {src, dims{d0...dN}, im2col_offsets{dims-2}
3061 size_t NumOps = N->getNumOperands();
3062 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
3063 : (NumOps - 5);
3064 // Offsets is always 'NumDims - 2' and only for im2col mode
3065 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
3066 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3070 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
3071 Ops.push_back(N->getOperand(0)); // Chain operand
3075 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 3081 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3085 size_t NumOps = N->getNumOperands();
3086 size_t NumDims = NumOps - 6;
3087 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3091 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
3093 Ops.push_back(N->getOperand(0)); // Chain operand
3096 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3099 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 3103 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3107 size_t NumOps = N->getNumOperands();
3108 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3112 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
3113 Ops.push_back(N->getOperand(0)); // Chain operand
3116 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3119 Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
3120 : NVPTX::CP_ASYNC_BULK_S2G_CH;
3122 Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
3123 : NVPTX::CP_ASYNC_BULK_S2G;
3124 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 3128 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3133 size_t NumOps = N->getNumOperands();
3134 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3135 bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
3140 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
3144 Ops.push_back(N->getOperand(MultiCastIdx));
3148 Ops.push_back(N->getOperand(MultiCastIdx + 1));
3151 Ops.push_back(N->getOperand(0));
3154 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3157 return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH
3158 : NVPTX::CP_ASYNC_BULK_G2S_MC_CH;
3160 return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC
3161 : NVPTX::CP_ASYNC_BULK_G2S_MC;
3163 return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH
3164 : NVPTX::CP_ASYNC_BULK_G2S_CH;
3165 return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32
3166 : NVPTX::CP_ASYNC_BULK_G2S;
3168 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 3172 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3176 size_t NumOps = N->getNumOperands();
3177 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3181 SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
3182 Ops.push_back(N->getOperand(0)); // Chain operand
3185 ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
3186 : NVPTX::CP_ASYNC_BULK_PREFETCH;
3187 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
3191 unsigned IID = N->getConstantOperandVal(1);