Lines Matching +full:2 +full:- +full:pixel +full:- +full:align

1 //===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines all of the R600-specific intrinsics.
11 //===----------------------------------------------------------------------===//
15 // The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
16 // by the backend cause whole-program undefined behavior when violated, such as
18 // values. In non-entry-point functions, attempting to call a function that needs
20 // of the calling convention and also program-level UB. Outside of such IR-level UB,
21 // these preloaded registers are always set to a well-defined value and are thus `noundef`.
70 // 2nd parameter: Index
144 //===----------------------------------------------------------------------===//
146 //===----------------------------------------------------------------------===//
154 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
159 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
164 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
169 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
172 // This is no longer guaranteed to be a compile-time constant due to linking
189 [Align<RetIndex, 4>, NoUndef<RetIndex>,
192 // Set EXEC to the 64-bit value given.
196 [llvm_i64_ty], // 64-bit literal constant
206 [llvm_i32_ty, // 32-bit SGPR input
221 //===----------------------------------------------------------------------===//
223 //===----------------------------------------------------------------------===//
288 // MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
313 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
338 // 2nd parameter: Denominator
343 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
397 // Fused single-precision multiply-add with legacy behaviour for the multiply,
398 // which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
429 // out = 1.0 / sqrt(a) result clamped to +/- max_float.
531 [LLVMQualPointerType<2>, // IntToPtr(M0)
537 // gfx10: bits 24-27 indicate the number of active threads/dwords
541 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
565 // New-style image intrinsics
568 // Dimension-aware image intrinsics framework
613 string Name = name; // e.g. "2darraymsaa"
614 string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
633 def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
638 def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
640 def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
642 def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
658 // {offset} {bias} {z-compare}
712 // Helper class to capture the profile of a dimension-aware image intrinsic.
834 int NumSampArgs = !if(P_.IsSample, 2, 0);
842 int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
851 // All dimension-aware intrinsics are derived from this class.
855 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return
857 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
866 // bit 2 = dlc (gfx10/gfx11),
869 // gfx12+: bits [0-2] = th, bits [3-4] = scope
1100 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1103 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1106 [IntrNoMem, ImmArg<ArgIndex<2>>]>,
1130 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1133 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1148 // bit 2 = dlc on gfx10+),
1160 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1163 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1179 // bit 2 = dlc on gfx10+),
1192 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1195 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1210 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1213 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1229 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1232 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1247 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1250 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1267 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1270 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1286 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1289 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1331 AMDGPURsrcIntrinsic<2, 0>;
1367 [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
1369 AMDGPURsrcIntrinsic<2, 0>;
1374 // Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx12+.
1410 AMDGPURsrcIntrinsic<2, 0>;
1445 [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
1447 AMDGPURsrcIntrinsic<2, 0>;
1461 // - raw and struct variants
1462 // - joint format field
1463 // - joint cachepolicy field
1471 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1474 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1487 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1490 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1505 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1508 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1523 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1526 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1541 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1544 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1559 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1562 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1578 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1581 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1597 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1600 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1611 llvm_i32_ty, // Data byte size: 1/2/4
1616 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1619 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1622 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
1630 llvm_i32_ty, // Data byte size: 1/2/4
1635 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1638 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1644 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
1652 llvm_i32_ty, // Data byte size: 1/2/4
1658 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1661 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1664 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
1672 llvm_i32_ty, // Data byte size: 1/2/4
1678 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1681 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1687 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
1807 // unsafe to change in non-strictfp functions. The register properties
1822 // This intrinsic always returns PC sign-extended from 48 bits even if the
1823 // s_getpc_b64 instruction returns a zero-extended value.
1830 // param values: 0 = P10, 1 = P20, 2 = P0
1836 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
1846 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
1854 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
1858 // high selects whether high or low 16-bits are loaded from LDS
1864 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
1867 // high selects whether high or low 16-bits are loaded from LDS
1873 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
1905 // high selects whether high or low 16-bits are used for p and p0 operands
1913 // high selects whether high or low 16-bits are used for p operand
1921 // gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1922 // high selects whether high or low 16-bits are used for p and p0 operands
1930 // gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1931 // high selects whether high or low 16-bits are used for p operand
2034 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2039 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2071 // 2. DPP)
2201 // Return true if at least one thread within the pixel quad passes true into
2224 // enabled, with a few exceptions: - Phi nodes which require WWM return an
2290 // non-zero.
2295 //===----------------------------------------------------------------------===//
2297 //===----------------------------------------------------------------------===//
2307 //===----------------------------------------------------------------------===//
2309 //===----------------------------------------------------------------------===//
2317 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
2329 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
2362 //===----------------------------------------------------------------------===//
2364 //===----------------------------------------------------------------------===//
2372 llvm_i32_ty, // Data byte size: 1/2/4
2378 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2386 //===----------------------------------------------------------------------===//
2388 //===----------------------------------------------------------------------===//
2405 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
2437 //===----------------------------------------------------------------------===//
2439 //===----------------------------------------------------------------------===//
2475 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2514 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
2521 // The content of the other 16-bit half is preserved from the input.
2535 // The content of the other 16-bit half is undefined.
2541 //===----------------------------------------------------------------------===//
2543 //===----------------------------------------------------------------------===//
2559 // SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
2589 [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
2593 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2598 // A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
2638 // <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64
2639 // <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128
2641 // i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64
2642 // <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128
2651 //===----------------------------------------------------------------------===//
2653 //===----------------------------------------------------------------------===//
2744 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2759 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2777 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2791 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
2795 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
2811 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
2830 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
2845 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
2849 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
2867 //===----------------------------------------------------------------------===//
2869 // ===----------------------------------------------------------------------===//
2905 //===----------------------------------------------------------------------===//
2907 // ===----------------------------------------------------------------------===//
2929 //===----------------------------------------------------------------------===//
2931 // ===----------------------------------------------------------------------===//
2996 // word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
3009 // word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
3034 //===----------------------------------------------------------------------===//
3037 // ===----------------------------------------------------------------------===//
3039 // Control-flow intrinsics in LLVM IR are convergent because they represent the
3041 // lock-step". But they exist during a small window in the lowering process,