IntrinsicsAMDGPU.td - OpenGrok cross reference for /freebsd-src/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines Matching +full:2 +full:- +full:pixel +full:- +full:align
1 //===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines all of the R600-specific intrinsics.
11 //===----------------------------------------------------------------------===//
15 // The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
16 // by the backend cause whole-program undefined behavior when violated, such as
18 // values. In non-entry-point functions, attempting to call a function that needs
20 // of the calling convention and also program-level UB. Outside of such IR-level UB,
21 // these preloaded registers are always set to a well-defined value and are thus `noundef`.
70   // 2nd parameter: Index
144 //===----------------------------------------------------------------------===//
146 //===----------------------------------------------------------------------===//
154   [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
159   [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
164   [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
169   [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
172 // This is no longer guaranteed to be a compile-time constant due to linking
189   [Align<RetIndex, 4>, NoUndef<RetIndex>,
192 // Set EXEC to the 64-bit value given.
196   [llvm_i64_ty],      // 64-bit literal constant
206   [llvm_i32_ty,       // 32-bit SGPR input
221 //===----------------------------------------------------------------------===//
223 //===----------------------------------------------------------------------===//
288 //     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
313   [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
338   // 2nd parameter: Denominator
343   [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
397 // Fused single-precision multiply-add with legacy behaviour for the multiply,
398 // which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
429 // out = 1.0 / sqrt(a) result clamped to +/- max_float.
531   [LLVMQualPointerType<2>, // IntToPtr(M0)
537                 // gfx10: bits 24-27 indicate the number of active threads/dwords
541    ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
565 // New-style image intrinsics
568 // Dimension-aware image intrinsics framework
613   string Name = name; // e.g. "2darraymsaa"
614   string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
633 def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
638   def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
640 def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
642   def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
658   // {offset} {bias} {z-compare}
712 // Helper class to capture the profile of a dimension-aware image intrinsic.
834   int NumSampArgs = !if(P_.IsSample, 2, 0);
842   int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
851 // All dimension-aware intrinsics are derived from this class.
855     P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
857       !foreach(arg, P_.DataArgs, arg.Type),    // vdata(VGPR) -- for store/atomic
866                                                //                bit 2 = dlc (gfx10/gfx11),
869                                                //        gfx12+: bits [0-2] = th, bits [3-4] = scope
1100                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1103                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1106   [IntrNoMem, ImmArg<ArgIndex<2>>]>,
1130                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1133                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1148                       //                                       bit 2 = dlc on gfx10+),
1160                           //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1163                           //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1179                       //                                       bit 2 = dlc on gfx10+),
1192                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1195                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1210                           //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1213                           //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1229                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1232                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1247                           //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1250                           //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1267                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1270                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1286                           //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1289                           //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1331   AMDGPURsrcIntrinsic<2, 0>;
1367   [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
1369   AMDGPURsrcIntrinsic<2, 0>;
1374 // Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx12+.
1410   AMDGPURsrcIntrinsic<2, 0>;
1445   [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
1447   AMDGPURsrcIntrinsic<2, 0>;
1461 // - raw and struct variants
1462 // - joint format field
1463 // - joint cachepolicy field
1471                       //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1474                       //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1487                         //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1490                         //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1505                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1508                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1523                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1526                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1541                       //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1544                       //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1559                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1562                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1578                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1581                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1597                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1600                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1611    llvm_i32_ty,               // Data byte size: 1/2/4
1616                               //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1619                               //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1622   [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
1630    llvm_i32_ty,               // Data byte size: 1/2/4
1635                               //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1638                               //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1644    ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
1652    llvm_i32_ty,               // Data byte size: 1/2/4
1658                               //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1661                               //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1664   [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
1672    llvm_i32_ty,               // Data byte size: 1/2/4
1678                               //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1681                               //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1687    ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
1807 // unsafe to change in non-strictfp functions. The register properties
1822 // This intrinsic always returns PC sign-extended from 48 bits even if the
1823 // s_getpc_b64 instruction returns a zero-extended value.
1830 // param values: 0 = P10, 1 = P20, 2 = P0
1836               ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
1846              ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
1854              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
1858 // high selects whether high or low 16-bits are loaded from LDS
1864              ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
1867 // high selects whether high or low 16-bits are loaded from LDS
1873              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
1905 // high selects whether high or low 16-bits are used for p and p0 operands
1913 // high selects whether high or low 16-bits are used for p operand
1921 // gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1922 // high selects whether high or low 16-bits are used for p and p0 operands
1930 // gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1931 // high selects whether high or low 16-bits are used for p operand
2034              ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2039              ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2071                           //                                          2. DPP)
2201 // Return true if at least one thread within the pixel quad passes true into
2224 // enabled, with a few exceptions: - Phi nodes which require WWM return an
2290                             // non-zero.
2295 //===----------------------------------------------------------------------===//
2297 //===----------------------------------------------------------------------===//
2307 //===----------------------------------------------------------------------===//
2309 //===----------------------------------------------------------------------===//
2317              ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
2329               ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
2362 //===----------------------------------------------------------------------===//
2364 //===----------------------------------------------------------------------===//
2372      llvm_i32_ty,                       // Data byte size: 1/2/4
2378      ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2386 //===----------------------------------------------------------------------===//
2388 //===----------------------------------------------------------------------===//
2405 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
2437 //===----------------------------------------------------------------------===//
2439 //===----------------------------------------------------------------------===//
2475 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2514     [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
2521 // The content of the other 16-bit half is preserved from the input.
2535 //        The content of the other 16-bit half is undefined.
2541 //===----------------------------------------------------------------------===//
2543 //===----------------------------------------------------------------------===//
2559 // SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
2589     [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
2593 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2598 // A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
2638 // <2 x i32>    @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))  -> global_load_tr_b64
2639 // <8 x i16>    @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))  -> global_load_tr_b128
2641 // i32          @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))    -> global_load_tr_b64
2642 // <4 x i16>    @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))  -> global_load_tr_b128
2651 //===----------------------------------------------------------------------===//
2653 //===----------------------------------------------------------------------===//
2744 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2759 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2777 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2791      ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
2795 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
2811 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
2830 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
2845      ImmArg<ArgIndex<0>>,  ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
2849 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2867 //===----------------------------------------------------------------------===//
2869 // ===----------------------------------------------------------------------===//
2905 //===----------------------------------------------------------------------===//
2907 // ===----------------------------------------------------------------------===//
2929 //===----------------------------------------------------------------------===//
2931 // ===----------------------------------------------------------------------===//
2996 // word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
3009 // word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
3034 //===----------------------------------------------------------------------===//
3037 // ===----------------------------------------------------------------------===//
3039 // Control-flow intrinsics in LLVM IR are convergent because they represent the
3041 // lock-step". But they exist during a small window in the lowering process,