Lines Matching +full:4 +full:- +full:pixel +full:- +full:align
1 //===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines all of the AMDGPU-specific intrinsics.
11 //===----------------------------------------------------------------------===//
15 // The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
16 // by the backend cause whole-program undefined behavior when violated, such as
18 // values. In non-entry-point functions, attempting to call a function that needs
20 // of the calling convention and also program-level UB. Outside of such IR-level UB,
21 // these preloaded registers are always set to a well-defined value and are thus `noundef`.
144 //===----------------------------------------------------------------------===//
146 //===----------------------------------------------------------------------===//
153 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
154 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
158 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
159 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;
163 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
164 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
168 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
169 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
172 // This is no longer guaranteed to be a compile-time constant due to linking
188 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
189 [Align<RetIndex, 4>, NoUndef<RetIndex>,
192 // Set EXEC to the 64-bit value given.
196 [llvm_i64_ty], // 64-bit literal constant
206 [llvm_i32_ty, // 32-bit SGPR input
221 //===----------------------------------------------------------------------===//
223 //===----------------------------------------------------------------------===//
288 // MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
356 // Look Up 2.0 / pi src0 with segment select src1[4:0]
397 // Fused single-precision multiply-add with legacy behaviour for the multiply,
398 // which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
429 // out = 1.0 / sqrt(a) result clamped to +/- max_float.
537 // gfx10: bits 24-27 indicate the number of active threads/dwords
541 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
565 // New-style image intrinsics
568 // Dimension-aware image intrinsics framework
658 // {offset} {bias} {z-compare}
712 // Helper class to capture the profile of a dimension-aware image intrinsic.
842 int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
851 // All dimension-aware intrinsics are derived from this class.
855 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return
857 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
867 // bit 4 = scc (gfx90a)
868 // gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1
869 // gfx12+: bits [0-2] = th, bits [3-4] = scope
1101 // bit 3 = swz, bit 4 = scc (gfx90a)
1102 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1103 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1117 // The versions of these intrinsics that take <4 x i32> arguments are deprecated
1131 // bit 3 = swz, bit 4 = scc (gfx90a)
1132 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1133 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1161 // bit 3 = swz, bit 4 = scc (gfx90a)
1162 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1163 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1193 // bit 3 = swz, bit 4 = scc (gfx90a)
1194 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1195 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1198 [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1211 // bit 3 = swz, bit 4 = scc (gfx90a)
1212 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1213 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1217 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1230 // bit 3 = swz, bit 4 = scc (gfx90a)
1231 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1232 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1235 [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1248 // bit 3 = swz, bit 4 = scc (gfx90a)
1249 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1250 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1254 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1268 // bit 3 = swz, bit 4 = scc (gfx90a)
1269 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1270 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1287 // bit 3 = swz, bit 4 = scc (gfx90a)
1288 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1289 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1305 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1341 ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1461 // - raw and struct variants
1462 // - joint format field
1463 // - joint cachepolicy field
1469 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1472 // bit 3 = swz, bit 4 = scc (gfx90a)
1473 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1474 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1477 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1485 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1488 // bit 3 = swz, bit 4 = scc (gfx90a)
1489 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1490 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1494 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1503 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1506 // bit 3 = swz, bit 4 = scc (gfx90a)
1507 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1508 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1512 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1521 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1524 // bit 3 = swz, bit 4 = scc (gfx90a)
1525 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1526 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1530 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1539 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1542 // bit 3 = swz, bit 4 = scc (gfx90a)
1543 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1544 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1548 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1557 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1560 // bit 3 = swz, bit 4 = scc (gfx90a)
1561 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1562 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1566 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1576 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1579 // bit 3 = swz, bit 4 = scc (gfx90a)
1580 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1581 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1595 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1598 // bit 3 = swz, bit 4 = scc (gfx90a)
1599 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1600 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1611 llvm_i32_ty, // Data byte size: 1/2/4
1617 // bit 3 = swz, bit 4 = scc (gfx90a)
1618 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1619 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1630 llvm_i32_ty, // Data byte size: 1/2/4
1636 // bit 3 = swz, bit 4 = scc (gfx90a)
1637 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1638 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1652 llvm_i32_ty, // Data byte size: 1/2/4
1659 // bit 3 = swz, bit 4 = scc (gfx90a)
1660 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1661 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1672 llvm_i32_ty, // Data byte size: 1/2/4
1679 // bit 3 = swz, bit 4 = scc (gfx90a)
1680 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1681 // gfx12+: bits [0-2] = th, bits [3-4] = scope,
1731 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
1807 // unsafe to change in non-strictfp functions. The register properties
1819 // not cross a 4Gb address boundary. Use for any other purpose may not
1822 // This intrinsic always returns PC sign-extended from 48 bits even if the
1823 // s_getpc_b64 instruction returns a zero-extended value.
1858 // high selects whether high or low 16-bits are loaded from LDS
1867 // high selects whether high or low 16-bits are loaded from LDS
1873 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
1905 // high selects whether high or low 16-bits are used for p and p0 operands
1913 // high selects whether high or low 16-bits are used for p operand
1921 // gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1922 // high selects whether high or low 16-bits are used for p and p0 operands
1930 // gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1931 // high selects whether high or low 16-bits are used for p operand
2201 // Return true if at least one thread within the pixel quad passes true into
2224 // enabled, with a few exceptions: - Phi nodes which require WWM return an
2290 // non-zero.
2292 [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
2295 //===----------------------------------------------------------------------===//
2297 //===----------------------------------------------------------------------===//
2307 //===----------------------------------------------------------------------===//
2309 //===----------------------------------------------------------------------===//
2318 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
2330 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
2362 //===----------------------------------------------------------------------===//
2364 //===----------------------------------------------------------------------===//
2372 llvm_i32_ty, // Data byte size: 1/2/4
2376 // bit 4 = scc))
2378 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2386 //===----------------------------------------------------------------------===//
2388 //===----------------------------------------------------------------------===//
2395 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
2402 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
2405 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
2437 //===----------------------------------------------------------------------===//
2439 //===----------------------------------------------------------------------===//
2475 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2521 // The content of the other 16-bit half is preserved from the input.
2535 // The content of the other 16-bit half is undefined.
2541 //===----------------------------------------------------------------------===//
2543 //===----------------------------------------------------------------------===//
2550 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
2557 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
2559 // SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
2593 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2638 // <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64
2639 // <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128
2641 // i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64
2642 // <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128
2651 //===----------------------------------------------------------------------===//
2653 //===----------------------------------------------------------------------===//
2796 // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
2812 // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
2831 // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
2867 //===----------------------------------------------------------------------===//
2869 //===----------------------------------------------------------------------===//
2880 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
2905 //===----------------------------------------------------------------------===//
2907 //===----------------------------------------------------------------------===//
2929 //===----------------------------------------------------------------------===//
2931 //===----------------------------------------------------------------------===//
2948 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
3034 //===----------------------------------------------------------------------===//
3037 //===----------------------------------------------------------------------===//
3039 // Control-flow intrinsics in LLVM IR are convergent because they represent the
3041 // lock-step". But they exist during a small window in the lowering process,