1//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file defines all of the R600-specific intrinsics. 10// 11//===----------------------------------------------------------------------===// 12 13def global_ptr_ty : LLVMQualPointerType<1>; 14def local_ptr_ty : LLVMQualPointerType<3>; 15 16// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred 17// by the backend cause whole-program undefined behavior when violated, such as 18// by causing all other preload register intrinsics to return arbitrarily incorrect 19// values. In non-entry-point functions, attempting to call a function that needs 20// some preloaded register from a function that is known to not need it is a violation 21// of the calling convention and also program-level UB. Outside of such IR-level UB, 22// these preloaded registers are always set to a well-defined value and are thus `noundef`. 23class AMDGPUReadPreloadRegisterIntrinsic 24 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 25 26class AMDGPUReadPreloadRegisterIntrinsicNamed<string name> 27 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>; 28 29// Used to tag image and resource intrinsics with information used to generate 30// mem operands. 31class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> { 32 int RsrcArg = rsrcarg; 33 bit IsImage = isimage; 34} 35 36let TargetPrefix = "r600" in { 37 38multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz { 39 def _x : AMDGPUReadPreloadRegisterIntrinsic; 40 def _y : AMDGPUReadPreloadRegisterIntrinsic; 41 def _z : AMDGPUReadPreloadRegisterIntrinsic; 42} 43 44multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> { 45 def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>; 46 def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>; 47 def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>; 48} 49 50defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 51 <"__builtin_r600_read_global_size">; 52defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 53 <"__builtin_r600_read_ngroups">; 54defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 55 <"__builtin_r600_read_tgid">; 56 57defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; 58defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; 59 60def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, 61 Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; 62 63// AS 7 is PARAM_I_ADDRESS, used for kernel arguments 64def int_r600_implicitarg_ptr : 65 ClangBuiltin<"__builtin_r600_implicitarg_ptr">, 66 DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [], 67 [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 68 69def int_r600_rat_store_typed : 70 // 1st parameter: Data 71 // 2nd parameter: Index 72 // 3rd parameter: Constant RAT ID 73 DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, 74 ClangBuiltin<"__builtin_r600_rat_store_typed">; 75 76def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic< 77 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 78>; 79 80def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic< 81 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 82>; 83 84def int_r600_cube : DefaultAttrsIntrinsic< 85 [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] 86>; 87 88def int_r600_store_stream_output : DefaultAttrsIntrinsic< 89 [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] 90>; 91 92class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [ 93 llvm_v4f32_ty, // Coord 94 llvm_i32_ty, // offset_x 95 llvm_i32_ty, // offset_y, 96 llvm_i32_ty, // offset_z, 97 llvm_i32_ty, // resource_id 98 llvm_i32_ty, // samplerid 99 llvm_i32_ty, // coord_type_x 100 llvm_i32_ty, // coord_type_y 101 llvm_i32_ty, // coord_type_z 102 llvm_i32_ty], // coord_type_w 103 [IntrNoMem] 104>; 105 106class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [ 107 llvm_v4i32_ty, // Coord 108 llvm_i32_ty, // offset_x 109 llvm_i32_ty, // offset_y, 110 llvm_i32_ty, // offset_z, 111 llvm_i32_ty, // resource_id 112 llvm_i32_ty, // samplerid 113 llvm_i32_ty, // coord_type_x 114 llvm_i32_ty, // coord_type_y 115 llvm_i32_ty, // coord_type_z 116 llvm_i32_ty], // coord_type_w 117 [IntrNoMem] 118>; 119 120def int_r600_store_swizzle : 121 Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 122>; 123 124def int_r600_tex : TextureIntrinsicFloatInput; 125def int_r600_texc : TextureIntrinsicFloatInput; 126def int_r600_txl : TextureIntrinsicFloatInput; 127def int_r600_txlc : TextureIntrinsicFloatInput; 128def int_r600_txb : TextureIntrinsicFloatInput; 129def int_r600_txbc : TextureIntrinsicFloatInput; 130def int_r600_txf : TextureIntrinsicInt32Input; 131def int_r600_txq : TextureIntrinsicInt32Input; 132def int_r600_ddx : TextureIntrinsicFloatInput; 133def int_r600_ddy : TextureIntrinsicFloatInput; 134 135def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty], 136 [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] 137>; 138 139def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>; 140 141} // End TargetPrefix = "r600" 142 143let TargetPrefix = "amdgcn" in { 144 145//===----------------------------------------------------------------------===// 146// ABI Special Intrinsics 147//===----------------------------------------------------------------------===// 148 149defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; 150defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 151 <"__builtin_amdgcn_workgroup_id">; 152 153def int_amdgcn_dispatch_ptr : 154 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 155 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>; 156 157def int_amdgcn_queue_ptr : 158 ClangBuiltin<"__builtin_amdgcn_queue_ptr">, 159 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 160 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>; 161 162def int_amdgcn_kernarg_segment_ptr : 163 ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, 164 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 165 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 166 167def int_amdgcn_implicitarg_ptr : 168 ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, 169 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 170 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 171 172// Returns the amount of LDS statically allocated for this program. 173// This is no longer guaranteed to be a compile-time constant due to linking 174// support. 175def int_amdgcn_groupstaticsize : 176 ClangBuiltin<"__builtin_amdgcn_groupstaticsize">, 177 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 178 179def int_amdgcn_dispatch_id : 180 ClangBuiltin<"__builtin_amdgcn_dispatch_id">, 181 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 182 183// For internal use. Coordinates LDS lowering between IR transform and backend. 184def int_amdgcn_lds_kernel_id : 185 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 186 187def int_amdgcn_implicit_buffer_ptr : 188 ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, 189 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 190 [Align<RetIndex, 4>, NoUndef<RetIndex>, 191 IntrNoMem, IntrSpeculatable]>; 192 193// Set EXEC to the 64-bit value given. 194// This is always moved to the beginning of the basic block. 195// FIXME: Should be mangled for wave size. 196def int_amdgcn_init_exec : Intrinsic<[], 197 [llvm_i64_ty], // 64-bit literal constant 198 [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback, 199 IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>; 200 201// Set EXEC according to a thread count packed in an SGPR input: 202// thread_count = (input >> bitoffset) & 0x7f; 203// This is always moved to the beginning of the basic block. 204// Note: only inreg arguments to the parent function are valid as 205// inputs to this intrinsic, computed values cannot be used. 206def int_amdgcn_init_exec_from_input : Intrinsic<[], 207 [llvm_i32_ty, // 32-bit SGPR input 208 llvm_i32_ty], // bit offset of the thread count 209 [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback, 210 IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>; 211 212// Sets the function into whole-wave-mode and returns whether the lane was 213// active when entering the function. A branch depending on this return will 214// revert the EXEC mask to what it was when entering the function, thus 215// resulting in a no-op. This pattern is used to optimize branches when function 216// tails need to be run in whole-wave-mode. It may also have other consequences 217// (mostly related to WWM CSR handling) that differentiate it from using 218// a plain `amdgcn.init.exec -1`. 219def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [ 220 IntrHasSideEffects, IntrNoMem, IntrConvergent]>; 221 222def int_amdgcn_wavefrontsize : 223 ClangBuiltin<"__builtin_amdgcn_wavefrontsize">, 224 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 225 226// Represent a relocation constant. 227def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< 228 [llvm_i32_ty], [llvm_metadata_ty], 229 [IntrNoMem, IntrSpeculatable] 230>; 231 232//===----------------------------------------------------------------------===// 233// Instruction Intrinsics 234//===----------------------------------------------------------------------===// 235 236// The first parameter is s_sendmsg immediate (i16), 237// the second one is copied to m0 238def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, 239 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], 240 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 241def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, 242 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], 243 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 244 245 246// gfx11 intrinsic 247// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. 248def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], 249 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 250 251// Vanilla workgroup sync-barrier 252def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, 253 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 254 255// Lower-level split-barrier intrinsics 256 257// void @llvm.amdgcn.s.barrier.signal(i32 %barrierType) 258// only for non-named barrier 259def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">, 260 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 261 IntrNoCallback, IntrNoFree]>; 262 263// void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %barrier, i32 %memberCnt) 264// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined. 265def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">, 266 Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 267 IntrNoCallback, IntrNoFree]>; 268 269// bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType) 270// only for non-named barrier 271def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">, 272 Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 273 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 274 275// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt) 276// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined. 277def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">, 278 Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, 279 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 280 281// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier) 282// The %barrier argument must be uniform, otherwise behavior is undefined. 283def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, 284 Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 285 IntrNoCallback, IntrNoFree]>; 286 287// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType) 288def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, 289 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 290 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 291 292// void @llvm.amdgcn.s.barrier.leave(i16 %barrierType) 293def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">, 294 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 295 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 296 297// uint32_t @llvm.amdgcn.s.get.barrier.state(i32 %barrierId) 298// The %barrierType argument must be uniform, otherwise behavior is undefined. 299def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">, 300 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 301 IntrNoCallback, IntrNoFree]>; 302 303// uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier) 304// The %barrier argument must be uniform, otherwise behavior is undefined. 305def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">, 306 Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 307 IntrNoCallback, IntrNoFree]>; 308 309def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, 310 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 311 312// The 1st parameter is a mask for the types of instructions that may be allowed 313// to cross the SCHED_BARRIER during scheduling. 314// MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. 315// MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be 316// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. 317// MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. 318// MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. 319// MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER. 320// MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. 321// MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. 322// MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. 323// MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. 324// MASK = 0x0000 0100: ALL DS read instructions may be scheduled accoss SCHED_BARRIER. 325// MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. 326def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, 327 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 328 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 329 330// The first parameter is a mask that determines the types of instructions that 331// you would like to synchronize around and add to a scheduling group. The 332// values of the mask are defined above for sched_barrier. These instructions 333// will be selected from the bottom up starting from the sched_group_barrier's 334// location during instruction scheduling. The second parameter is the number of 335// matching instructions that will be associated with this sched_group_barrier. 336// The third parameter is an identifier which is used to describe what other 337// sched_group_barriers should be synchronized with. 338def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, 339 Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 340 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, 341 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 342 343// Scheduler optimization hint. 344// MASK = 0: Small gemm opt 345def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, 346 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 347 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 348 349def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, 350 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 351 352// GFX12 intrinsics 353class AMDGPUWaitIntrinsic : 354 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 355def int_amdgcn_s_wait_bvhcnt : AMDGPUWaitIntrinsic; 356def int_amdgcn_s_wait_dscnt : AMDGPUWaitIntrinsic; 357def int_amdgcn_s_wait_expcnt : AMDGPUWaitIntrinsic; 358def int_amdgcn_s_wait_kmcnt : AMDGPUWaitIntrinsic; 359def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic; 360def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic; 361def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic; 362 363def int_amdgcn_div_scale : DefaultAttrsIntrinsic< 364 // 1st parameter: Numerator 365 // 2nd parameter: Denominator 366 // 3rd parameter: Select quotient. Must equal Numerator or Denominator. 367 // (0 = Denominator, 1 = Numerator). 368 [llvm_anyfloat_ty, llvm_i1_ty], 369 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], 370 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] 371>; 372 373def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 374 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], 375 [IntrNoMem, IntrSpeculatable] 376>; 377 378def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 379 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 380 [IntrNoMem, IntrSpeculatable] 381>; 382 383// Look Up 2.0 / pi src0 with segment select src1[4:0] 384def int_amdgcn_trig_preop : DefaultAttrsIntrinsic< 385 [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], 386 [IntrNoMem, IntrSpeculatable] 387>; 388 389def int_amdgcn_sin : DefaultAttrsIntrinsic< 390 [llvm_anyfloat_ty], [LLVMMatchType<0>], 391 [IntrNoMem, IntrSpeculatable] 392>; 393 394def int_amdgcn_cos : DefaultAttrsIntrinsic< 395 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 396>; 397 398// v_log_{f16|f32}, performs log2. f32 version does not handle 399// denormals. There is no reason to use this for f16 as it does 400// support denormals, and the generic log2 intrinsic should be 401// preferred. 402def int_amdgcn_log : DefaultAttrsIntrinsic< 403 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 404>; 405 406// v_exp_{f16|f32} (int_amdgcn_exp was taken by export 407// already). Performs exp2. f32 version does not handle 408// denormals. There is no reason to use this for f16 as it does 409// support denormals, and the generic exp2 intrinsic should be 410// preferred. 411def int_amdgcn_exp2 : DefaultAttrsIntrinsic< 412 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 413>; 414 415def int_amdgcn_log_clamp : DefaultAttrsIntrinsic< 416 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 417>; 418 419def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">, 420 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], 421 [IntrNoMem, IntrSpeculatable, Commutative] 422>; 423 424// Fused single-precision multiply-add with legacy behaviour for the multiply, 425// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is 426// intended for use on subtargets that have the v_fma_legacy_f32 and/or 427// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and 428// has a completely different kind of legacy behaviour.) 429def int_amdgcn_fma_legacy : 430 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], 431 [IntrNoMem, IntrSpeculatable, Commutative] 432>; 433 434def int_amdgcn_rcp : DefaultAttrsIntrinsic< 435 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 436>; 437 438def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">, 439 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], 440 [IntrNoMem, IntrSpeculatable] 441>; 442 443def int_amdgcn_sqrt : DefaultAttrsIntrinsic< 444 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 445>; 446 447def int_amdgcn_rsq : DefaultAttrsIntrinsic< 448 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 449>; 450 451def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, 452 DefaultAttrsIntrinsic< 453 [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] 454>; 455 456// out = 1.0 / sqrt(a) result clamped to +/- max_float. 457def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic< 458 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; 459 460def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic< 461 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 462>; 463 464def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic< 465 [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] 466>; 467 468// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 469// and always uses rtz, so is not suitable for implementing the OpenCL 470// fract function. It should be ok on VI. 471def int_amdgcn_fract : DefaultAttrsIntrinsic< 472 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 473>; 474 475def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">, 476 DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], 477 [IntrNoMem, IntrSpeculatable] 478>; 479 480def int_amdgcn_cvt_pknorm_i16 : 481 ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, 482 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], 483 [IntrNoMem, IntrSpeculatable] 484>; 485 486def int_amdgcn_cvt_pknorm_u16 : 487 ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, 488 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], 489 [IntrNoMem, IntrSpeculatable] 490>; 491 492def int_amdgcn_cvt_pk_i16 : 493 ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">, 494 DefaultAttrsIntrinsic< 495 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], 496 [IntrNoMem, IntrSpeculatable] 497>; 498 499def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">, 500 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], 501 [IntrNoMem, IntrSpeculatable] 502>; 503 504def int_amdgcn_class : DefaultAttrsIntrinsic< 505 [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], 506 [IntrNoMem, IntrSpeculatable] 507>; 508 509def int_amdgcn_fmed3 : 510 DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 511 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 512 [IntrNoMem, IntrSpeculatable] 513>; 514 515def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, 516 DefaultAttrsIntrinsic<[llvm_float_ty], 517 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 518 [IntrNoMem, IntrSpeculatable] 519>; 520 521def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">, 522 DefaultAttrsIntrinsic<[llvm_float_ty], 523 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 524 [IntrNoMem, IntrSpeculatable] 525>; 526 527def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">, 528 DefaultAttrsIntrinsic<[llvm_float_ty], 529 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 530 [IntrNoMem, IntrSpeculatable] 531>; 532 533def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">, 534 DefaultAttrsIntrinsic<[llvm_float_ty], 535 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 536 [IntrNoMem, IntrSpeculatable] 537>; 538 539// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz 540// should be used. 541def int_amdgcn_sffbh : 542 DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], 543 [IntrNoMem, IntrSpeculatable] 544>; 545 546// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support. 547def int_amdgcn_fmad_ftz : 548 DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 549 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 550 [IntrNoMem, IntrSpeculatable] 551>; 552 553// FIXME: The m0 argument should be moved after the normal arguments 554class AMDGPUDSOrderedIntrinsic : Intrinsic< 555 [llvm_i32_ty], 556 // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that 557 // the bit packing can be optimized at the IR level. 558 [LLVMQualPointerType<2>, // IntToPtr(M0) 559 llvm_i32_ty, // value to add or swap 560 llvm_i32_ty, // ordering 561 llvm_i32_ty, // scope 562 llvm_i1_ty, // isVolatile 563 llvm_i32_ty, // ordered count index (OA index), also added to the address 564 // gfx10: bits 24-27 indicate the number of active threads/dwords 565 llvm_i1_ty, // wave release, usually set to 1 566 llvm_i1_ty], // wave done, set to 1 for the last ordered instruction 567 [IntrWillReturn, NoCapture<ArgIndex<0>>, 568 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, 569 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree 570 ] 571>; 572 573class AMDGPUDSAppendConsumedIntrinsic : Intrinsic< 574 [llvm_i32_ty], 575 [llvm_anyptr_ty, // LDS or GDS ptr 576 llvm_i1_ty], // isVolatile 577 [IntrConvergent, IntrWillReturn, IntrArgMemOnly, 578 Align<ArgIndex<0>, 4>, NoCapture<ArgIndex<0>>, 579 ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree], 580 "", 581 [SDNPMemOperand] 582>; 583 584def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic; 585def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; 586 587// The pointer argument is assumed to be dynamically uniform if a VGPR. 588def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; 589def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; 590 591class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< 592 [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] 593>, ClangBuiltin<"__builtin_amdgcn_"#name>; 594 595class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic< 596 [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] 597>, ClangBuiltin<"__builtin_amdgcn_"#name>; 598 599class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< 600 [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] 601>, ClangBuiltin<"__builtin_amdgcn_"#name>; 602 603def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">; 604def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">; 605def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">; 606def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">; 607def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">; 608def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">; 609 610def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">; 611def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">; 612def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">; 613def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_fp6_bf16">; 614def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">; 615def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">; 616 617class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic< 618 [DstTy], 619 [llvm_i32_ty, // src 620 llvm_float_ty, // scale 621 llvm_i32_ty], // src_sel index [0..3] 622 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>] 623>, ClangBuiltin<"__builtin_amdgcn_"#name>; 624 625class AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic< 626 [DstTy], 627 [llvm_i32_ty, // src 628 llvm_float_ty, // scale 629 llvm_i1_ty], // src_lo_hi_sel[true false] 630 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] 631>, ClangBuiltin<"__builtin_amdgcn_"#name>; 632 633class AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<LLVMType SrcTy, string name> : DefaultAttrsIntrinsic< 634 [llvm_v2i16_ty], 635 [llvm_v2i16_ty, // old_vdst 636 SrcTy, // src 637 llvm_float_ty, // scale 638 llvm_i1_ty], // dst_lo_hi_sel[true false] 639 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 640>, ClangBuiltin<"__builtin_amdgcn_"#name>; 641 642class AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<string name> : DefaultAttrsIntrinsic< 643 [llvm_v2i16_ty], 644 [llvm_v2i16_ty, // old_vdst 645 llvm_float_ty, // src0 646 llvm_float_ty, // src1 647 llvm_float_ty, // scale 648 llvm_i1_ty], // dst_lo_hi_sel[true false] 649 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<4>>] 650>, ClangBuiltin<"__builtin_amdgcn_"#name>; 651 652class AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic< 653 [DstTy], 654 [llvm_v2f16_ty, // old_vdst 655 llvm_i32_ty, // src 656 llvm_float_ty, // scale 657 llvm_i32_ty, // src_sel_index[0..3] 658 llvm_i1_ty], // dst_lo_hi_sel[true false] 659 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>] 660>, ClangBuiltin<"__builtin_amdgcn_"#name>; 661 662class AMDGPUCvtScaleF32ToFP4Intrinsic<string name> : DefaultAttrsIntrinsic< 663 [llvm_i32_ty], 664 [llvm_i32_ty, // old_vdst 665 llvm_float_ty, // src0 666 llvm_float_ty, // src1 667 llvm_float_ty, // scale 668 llvm_i32_ty], // dst_sel_index[0..3] 669 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<4>>] 670>, ClangBuiltin<"__builtin_amdgcn_"#name>; 671 672class AMDGPUCvtScaleF16ToFP4TiedInputIntrinsic<LLVMType SrcTy, string name> : DefaultAttrsIntrinsic< 673 [llvm_i32_ty], 674 [llvm_i32_ty, // old_vdst 675 SrcTy, // src 676 llvm_float_ty, // scale 677 llvm_i32_ty], // dest_sel_index [0..3] 678 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>] 679>, ClangBuiltin<"__builtin_amdgcn_"#name>; 680 681class AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< 682 [llvm_i32_ty], 683 [llvm_i32_ty, // old_vdst 684 Src0Ty, // src0 685 llvm_i32_ty, // seed 686 llvm_float_ty, // scale 687 llvm_i32_ty], // dst_sel_index[0..3] 688 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<4>>] 689>, ClangBuiltin<"__builtin_amdgcn_"#name>; 690 691class AMDGPUCvtScaleSRF32ToBF16F16TiedInputIntrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic< 692 [DstTy], 693 [DstTy, // old_vdst 694 llvm_float_ty, // src0 695 llvm_i32_ty, // seed 696 llvm_i1_ty], // dst_lo_hi_sel[true false] 697 [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>] 698>, ClangBuiltin<"__builtin_amdgcn_"#name>; 699 700def int_amdgcn_cvt_sr_bf16_f32: AMDGPUCvtScaleSRF32ToBF16F16TiedInputIntrinsic<llvm_v2bf16_ty, "cvt_sr_bf16_f32">; 701def int_amdgcn_cvt_sr_f16_f32 : AMDGPUCvtScaleSRF32ToBF16F16TiedInputIntrinsic<llvm_v2f16_ty, "cvt_sr_f16_f32">; 702 703// llvm.amdgcn.cvt.scalef32.fp16.fp8 v2f16 old_vdst, int src, float scale, int src_sel_index [0..3], bool dst_lo_hi_sel 704def int_amdgcn_cvt_scalef32_f16_fp8 : AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrinsic<llvm_v2f16_ty, "cvt_scalef32_f16_fp8">; 705def int_amdgcn_cvt_scalef32_f16_bf8 : AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrinsic<llvm_v2f16_ty, "cvt_scalef32_f16_bf8">; 706 707// llvm.amdgcn.cvt.scalef32.f32.fp8 int src, float scale, int src_sel_index [0..3] 708def int_amdgcn_cvt_scalef32_f32_fp8 : AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<llvm_float_ty, "cvt_scalef32_f32_fp8">; 709def int_amdgcn_cvt_scalef32_f32_bf8 : AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<llvm_float_ty, "cvt_scalef32_f32_bf8">; 710 711// llvm.amdgcn.cvt.scalef32.pk.fp8.f32 v2i16 old_vdst, float srcA, float srcB, float scale, bool dst_lo_hi_sel 712def int_amdgcn_cvt_scalef32_pk_fp8_f32 : AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<"cvt_scalef32_pk_fp8_f32">; 713def int_amdgcn_cvt_scalef32_pk_bf8_f32 : AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<"cvt_scalef32_pk_bf8_f32">; 714 715// llvm.amdgcn.cvt.scalef32.pk.fp32.fp8 int src, float scale, bool src_lo_hi_sel 716def int_amdgcn_cvt_scalef32_pk_f32_fp8 : AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<llvm_v2f32_ty, "cvt_scalef32_pk_f32_fp8">; 717def int_amdgcn_cvt_scalef32_pk_f32_bf8 : AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<llvm_v2f32_ty, "cvt_scalef32_pk_f32_bf8">; 718 719// llvm.amdgcn.cvt.scalef32.fp8.fp16 v2i16 old_vdst, v2f16 src, float scale, bool dst_lo_hi_sel 720def int_amdgcn_cvt_scalef32_pk_fp8_f16 : AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<llvm_v2f16_ty, "cvt_scalef32_pk_fp8_f16">; 721def int_amdgcn_cvt_scalef32_pk_fp8_bf16: AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<llvm_v2bf16_ty, "cvt_scalef32_pk_fp8_bf16">; 722def int_amdgcn_cvt_scalef32_pk_bf8_f16 : AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<llvm_v2f16_ty, "cvt_scalef32_pk_bf8_f16">; 723def int_amdgcn_cvt_scalef32_pk_bf8_bf16: AMDGPUCvtScaleF16BF16ToFP8BF8TiedInputIntrinsic<llvm_v2bf16_ty, "cvt_scalef32_pk_bf8_bf16">; 724 725// llvm.amdgcn.cvt.scalef32.pk.f32.fp4 int src, float scale, int src_sel_index [0..3] 726def int_amdgcn_cvt_scalef32_pk_f32_fp4 : AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<llvm_v2f32_ty, "cvt_scalef32_pk_f32_fp4">; 727 728// llvm.amdgcn.cvt.scalef32.pk.fp4.f32 i32 old_vdst, float srcA, float srcB, float scale, int dst_sel_index[0..3] 729def int_amdgcn_cvt_scalef32_pk_fp4_f32 : AMDGPUCvtScaleF32ToFP4Intrinsic<"cvt_scalef32_pk_fp4_f32">; 730 731// llvm.amdgcn.cvt.scalef32.pk.f32.fp4 int src, float scale, int src_sel_index [0..3] 732def int_amdgcn_cvt_scalef32_pk_f16_fp4 : AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<llvm_v2f16_ty, "cvt_scalef32_pk_f16_fp4">; 733def int_amdgcn_cvt_scalef32_pk_bf16_fp4: AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<llvm_v2bf16_ty, "cvt_scalef32_pk_bf16_fp4">; 734 735// llvm.amdgcn.cvt.scalef32.pk32.f32.fp6 v6i32 src, float scale 736def int_amdgcn_cvt_scalef32_pk32_f32_fp6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f32_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f32_fp6">; 737def int_amdgcn_cvt_scalef32_pk32_f32_bf6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f32_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f32_bf6">; 738 739// llvm.amdgcn.cvt.scalef32.pk32.f16.fp6 v6i32 src, float scale 740def int_amdgcn_cvt_scalef32_pk32_f16_bf6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f16_bf6">; 741def int_amdgcn_cvt_scalef32_pk32_bf16_bf6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32bf16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_bf16_bf6">; 742def int_amdgcn_cvt_scalef32_pk32_f16_fp6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f16_fp6">; 743def int_amdgcn_cvt_scalef32_pk32_bf16_fp6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32bf16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_bf16_fp6">; 744 745// llvm.amdgcn.cvt.scalef32.pk.fp16.fp8 int src, float scale, bool src_lo_hi_sel 746def int_amdgcn_cvt_scalef32_pk_f16_bf8 : AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<llvm_v2f16_ty, "cvt_scalef32_pk_f16_bf8">; 747def int_amdgcn_cvt_scalef32_pk_bf16_bf8 : AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<llvm_v2bf16_ty, "cvt_scalef32_pk_bf16_bf8">; 748def int_amdgcn_cvt_scalef32_pk_f16_fp8 : AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<llvm_v2f16_ty, "cvt_scalef32_pk_f16_fp8">; 749def int_amdgcn_cvt_scalef32_pk_bf16_fp8 : AMDGPUCvtScale_pk_FromFP8BF8Intrinsic<llvm_v2bf16_ty, "cvt_scalef32_pk_bf16_fp8">; 750 751// llvm.amdgcn.cvt.scalef32.pk.fp4.f16 int src, float scale, int dst_sel_index [0..3] 752def int_amdgcn_cvt_scalef32_pk_fp4_f16 : AMDGPUCvtScaleF16ToFP4TiedInputIntrinsic<llvm_v2f16_ty, "cvt_scalef32_pk_fp4_f16">; 753def int_amdgcn_cvt_scalef32_pk_fp4_bf16: AMDGPUCvtScaleF16ToFP4TiedInputIntrinsic<llvm_v2bf16_ty, "cvt_scalef32_pk_fp4_bf16">; 754 755def int_amdgcn_cvt_scalef32_sr_pk_fp4_f16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_v2f16_ty, "cvt_scalef32_sr_pk_fp4_f16">; 756def int_amdgcn_cvt_scalef32_sr_pk_fp4_bf16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_v2bf16_ty, "cvt_scalef32_sr_pk_fp4_bf16">; 757def int_amdgcn_cvt_scalef32_sr_pk_fp4_f32: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_v2f32_ty, "cvt_scalef32_sr_pk_fp4_f32">; 758def int_amdgcn_cvt_scalef32_sr_bf8_bf16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_bfloat_ty, "cvt_scalef32_sr_bf8_bf16">; 759def int_amdgcn_cvt_scalef32_sr_bf8_f16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_half_ty, "cvt_scalef32_sr_bf8_f16">; 760def int_amdgcn_cvt_scalef32_sr_bf8_f32: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_float_ty, "cvt_scalef32_sr_bf8_f32">; 761def int_amdgcn_cvt_scalef32_sr_fp8_bf16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_bfloat_ty, "cvt_scalef32_sr_fp8_bf16">; 762def int_amdgcn_cvt_scalef32_sr_fp8_f16: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_half_ty, "cvt_scalef32_sr_fp8_f16">; 763def int_amdgcn_cvt_scalef32_sr_fp8_f32: AMDGPUCvtScaleBF16F16F32SRToFP4BF8F8TiedInputIntrinsic<llvm_float_ty, "cvt_scalef32_sr_fp8_f32">; 764 765def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic< 766 [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem] 767>, ClangBuiltin<"__builtin_amdgcn_prng_b32">; 768 769def int_amdgcn_bitop3 : 770 DefaultAttrsIntrinsic<[llvm_anyint_ty], 771 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], 772 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>; 773 774} // TargetPrefix = "amdgcn" 775 776// New-style image intrinsics 777 778////////////////////////////////////////////////////////////////////////// 779// Dimension-aware image intrinsics framework 780////////////////////////////////////////////////////////////////////////// 781 782// Helper class to represent (type, name) combinations of arguments. The 783// argument names are explanatory and used as DAG operand names for codegen 784// pattern matching. 785class AMDGPUArg<LLVMType ty, string name> { 786 LLVMType Type = ty; 787 string Name = name; 788} 789 790// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...] 791class makeArgList<list<string> names, LLVMType basety> { 792 list<AMDGPUArg> ret = 793 !listconcat([AMDGPUArg<basety, names[0]>], 794 !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>)); 795} 796 797// Return arglist, with LLVMMatchType's references shifted by 'shift'. 798class arglistmatchshift<list<AMDGPUArg> arglist, int shift> { 799 list<AMDGPUArg> ret = 800 !foreach(arg, arglist, 801 !if(!isa<LLVMMatchType>(arg.Type), 802 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>, 803 arg.Name>, 804 arg)); 805} 806 807// Return the concatenation of the given arglists. LLVMMatchType's are adjusted 808// accordingly, and shifted by an additional 'shift'. 809class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> { 810 list<AMDGPUArg> ret = 811 !foldl([]<AMDGPUArg>, arglists, lhs, rhs, 812 !listconcat( 813 lhs, 814 arglistmatchshift<rhs, 815 !add(shift, !foldl(0, lhs, a, b, 816 !add(a, b.Type.isAny)))>.ret)); 817} 818 819// Represent texture/image types / dimensionality. 820class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix, 821 list<string> coord_names, list<string> slice_names, 822 bit msaa = 0> { 823 AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME); 824 string Name = name; // e.g. "2darraymsaa" 825 string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings) 826 bits<3> Encoding = enc; 827 bit DA = 0; // DA bit in MIMG encoding 828 bit MSAA = msaa; 829 830 list<AMDGPUArg> CoordSliceArgs = 831 makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret; 832 list<AMDGPUArg> CoordSliceIntArgs = 833 makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret; 834 list<AMDGPUArg> GradientArgs = 835 makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"), 836 !foreach(name, coord_names, "d" # name # "dv")), 837 llvm_anyfloat_ty>.ret; 838 839 bits<8> NumCoords = !size(CoordSliceArgs); 840 bits<8> NumGradients = !size(GradientArgs); 841} 842 843def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>; 844def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>; 845def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>; 846let DA = 1 in { 847 def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>; 848 def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>; 849 def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>; 850} 851def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>; 852let DA = 1 in { 853 def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>; 854} 855 856def AMDGPUDims { 857 list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D, 858 AMDGPUDimCube, AMDGPUDim1DArray, 859 AMDGPUDim2DArray]; 860 list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa]; 861 list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa); 862} 863 864// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof. 865class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> { 866 string UpperCaseMod = ucmod; 867 string LowerCaseMod = lcmod; 868 869 // {offset} {bias} {z-compare} 870 list<AMDGPUArg> ExtraAddrArgs = extra_addr; 871 bit Offset = false; 872 bit Bias = false; 873 bit ZCompare = false; 874 bit Gradients = false; 875 876 // Name of the {lod} or {clamp} argument that is appended to the coordinates, 877 // if any. 878 string LodOrClamp = ""; 879 880 bit UsesWQM = false; 881} 882 883// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE 884// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4 885defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = { 886 multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod, 887 list<AMDGPUArg> extra_addr> { 888 def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>; 889 let Offset = true in 890 def NAME#lcmod#_o : AMDGPUSampleVariant< 891 ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>; 892 } 893 894 multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod, 895 list<AMDGPUArg> extra_addr> { 896 defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>; 897 let ZCompare = true in 898 defm NAME : AMDGPUSampleHelper_Offset< 899 "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>; 900 } 901 902 multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod, 903 list<AMDGPUArg> extra_addr> { 904 defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>; 905 let LodOrClamp = "clamp" in 906 defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>; 907 } 908 909 defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = { 910 let UsesWQM = true in 911 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>; 912 let Bias = true, UsesWQM = true in 913 defm AMDGPUSample : AMDGPUSampleHelper_Clamp< 914 "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>; 915 let LodOrClamp = "lod" in 916 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>; 917 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>; 918 } 919 920 let Gradients = true in { 921 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>; 922 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>; 923 } 924} 925 926// Helper class to capture the profile of a dimension-aware image intrinsic. 927// This information is used to generate the intrinsic's type and to inform 928// codegen pattern matching. 929class AMDGPUDimProfile<string opmod, 930 AMDGPUDimProps dim> { 931 AMDGPUDimProps Dim = dim; 932 string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod 933 934 // These are intended to be overwritten by subclasses 935 bit IsSample = false; 936 bit IsAtomic = false; 937 list<LLVMType> RetTypes = []; 938 list<AMDGPUArg> DataArgs = []; 939 list<AMDGPUArg> ExtraAddrArgs = []; 940 bit Offset = false; 941 bit Bias = false; 942 bit ZCompare = false; 943 bit Gradients = false; 944 string LodClampMip = ""; 945 946 int NumRetAndDataAnyTypes = 947 !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b, 948 !add(a, b.isAny)); 949 950 list<AMDGPUArg> AddrArgs = 951 arglistconcat<[ExtraAddrArgs, 952 !if(Gradients, dim.GradientArgs, []), 953 !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs), 954 !if(!empty(LodClampMip), 955 []<AMDGPUArg>, 956 [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))], 957 NumRetAndDataAnyTypes>.ret; 958 list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type); 959 list<AMDGPUArg> AddrDefaultArgs = 960 !foreach(arg, AddrArgs, 961 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)), 962 !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type), 963 arg.Name>); 964 list<AMDGPUArg> AddrA16Args = 965 !foreach(arg, AddrArgs, 966 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)), 967 !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type), 968 arg.Name>); 969} 970 971class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> { 972 let IsSample = base.IsSample; 973 let IsAtomic = base.IsAtomic; 974 let RetTypes = base.RetTypes; 975 let DataArgs = base.DataArgs; 976 let ExtraAddrArgs = base.ExtraAddrArgs; 977 let Offset = base.Offset; 978 let Bias = base.Bias; 979 let ZCompare = base.ZCompare; 980 let Gradients = base.Gradients; 981 let LodClampMip = base.LodClampMip; 982} 983 984class AMDGPUDimSampleProfile<string opmod, 985 AMDGPUDimProps dim, 986 AMDGPUSampleVariant sample, 987 bit has_return = true> : AMDGPUDimProfile<opmod, dim> { 988 let IsSample = true; 989 let RetTypes = !if(has_return, [llvm_any_ty], []); 990 let ExtraAddrArgs = sample.ExtraAddrArgs; 991 let Offset = sample.Offset; 992 let Bias = sample.Bias; 993 let ZCompare = sample.ZCompare; 994 let Gradients = sample.Gradients; 995 let LodClampMip = sample.LodOrClamp; 996} 997 998class AMDGPUDimSampleNoReturnProfile<string opmod, 999 AMDGPUDimProps dim, 1000 AMDGPUSampleVariant sample> 1001 : AMDGPUDimSampleProfile<opmod, dim, sample, false> { 1002} 1003 1004class AMDGPUDimNoSampleProfile<string opmod, 1005 AMDGPUDimProps dim, 1006 list<LLVMType> retty, 1007 list<AMDGPUArg> dataargs, 1008 bit Mip = false> : AMDGPUDimProfile<opmod, dim> { 1009 let RetTypes = retty; 1010 let DataArgs = dataargs; 1011 let LodClampMip = !if(Mip, "mip", ""); 1012} 1013 1014class AMDGPUDimAtomicProfile<string opmod, 1015 AMDGPUDimProps dim, 1016 list<AMDGPUArg> dataargs, 1017 LLVMType rettype> : AMDGPUDimProfile<opmod, dim> { 1018 let RetTypes = [rettype]; 1019 let DataArgs = dataargs; 1020 let IsAtomic = true; 1021} 1022 1023class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> 1024 : AMDGPUDimProfile<"GET_RESINFO", dim> { 1025 let RetTypes = [llvm_anyfloat_ty]; 1026 let DataArgs = []; 1027 let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">]; 1028 let LodClampMip = "mip"; 1029} 1030 1031// Helper class for figuring out image intrinsic argument indexes. 1032class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> { 1033 int NumDataArgs = !size(P_.DataArgs); 1034 int NumDmaskArgs = !not(P_.IsAtomic); 1035 int NumOffsetArgs = !if(P_.Offset, 1, 0); 1036 int NumBiasArgs = !if(P_.Bias, 1, 0); 1037 int NumZCompareArgs = !if(P_.ZCompare, 1, 0); 1038 int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs); 1039 int NumVAddrArgs = !size(P_.AddrArgs); 1040 int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0); 1041 int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs)); 1042 int NumRSrcArgs = 1; 1043 int NumSampArgs = !if(P_.IsSample, 2, 0); 1044 int DmaskArgIndex = NumDataArgs; 1045 int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs); 1046 int OffsetArgIndex = VAddrArgIndex; 1047 int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs); 1048 int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs); 1049 int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs); 1050 int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs); 1051 int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); 1052 int MipArgIndex = LodArgIndex; 1053 int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs); 1054 int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs); 1055 int UnormArgIndex = !add(SampArgIndex, 1); 1056 int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs); 1057 int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1); 1058} 1059 1060// All dimension-aware intrinsics are derived from this class. 1061class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_, 1062 list<IntrinsicProperty> props, 1063 list<SDNodeProperty> sdnodeprops> : Intrinsic< 1064 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return 1065 !listconcat( 1066 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic 1067 !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm) 1068 P_.AddrTypes, // vaddr(VGPR) 1069 [llvm_any_ty], // rsrc(SGPR); Valid types: v4i32 and v8i32 1070 !if(P_.IsSample, [llvm_any_ty, // samp(SGPR); 1071 llvm_i1_ty], []), // unorm(imm) 1072 [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe) 1073 llvm_i32_ty]), // auxiliary/cachepolicy(imm): 1074 // bit 0 = glc, bit 1 = slc, 1075 // bit 2 = dlc (gfx10/gfx11), 1076 // bit 4 = scc (gfx90a) 1077 // gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1 1078 // gfx12+: bits [0-2] = th, bits [3-4] = scope 1079 !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn], 1080 !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]), 1081 !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []), 1082 [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>, 1083 ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>], 1084 !if(P_.IsAtomic, [], [IntrNoSync])), 1085 1086 1087 "", sdnodeprops>, 1088 AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes), 1089 !if(P_.IsAtomic, 0, 1)), 1> { 1090 AMDGPUDimProfile P = P_; 1091 1092 AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME); 1093 1094 let TargetPrefix = "amdgcn"; 1095} 1096 1097// Marker class for intrinsics with a DMask that determines the returned 1098// channels. 1099class AMDGPUImageDMaskIntrinsic; 1100 1101defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = { 1102 1103 ////////////////////////////////////////////////////////////////////////// 1104 // Load and store intrinsics 1105 ////////////////////////////////////////////////////////////////////////// 1106 multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod, 1107 list<LLVMType> retty, 1108 list<AMDGPUArg> dataargs, 1109 list<IntrinsicProperty> props, 1110 list<SDNodeProperty> sdnodeprops, 1111 bit Mip = false> { 1112 foreach dim = AMDGPUDims.NoMsaa in { 1113 def !strconcat(NAME, "_", dim.Name) 1114 : AMDGPUImageDimIntrinsic< 1115 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, 1116 props, sdnodeprops>; 1117 } 1118 } 1119 1120 multiclass AMDGPUImageDimIntrinsicsAll<string opmod, 1121 list<LLVMType> retty, 1122 list<AMDGPUArg> dataargs, 1123 list<IntrinsicProperty> props, 1124 list<SDNodeProperty> sdnodeprops, 1125 bit Mip = false> { 1126 foreach dim = AMDGPUDims.All in { 1127 def !strconcat(NAME, "_", dim.Name) 1128 : AMDGPUImageDimIntrinsic< 1129 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, 1130 props, sdnodeprops>; 1131 } 1132 } 1133 1134 defm int_amdgcn_image_load 1135 : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem], 1136 [SDNPMemOperand]>, 1137 AMDGPUImageDMaskIntrinsic; 1138 defm int_amdgcn_image_load_mip 1139 : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [], 1140 [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>, 1141 AMDGPUImageDMaskIntrinsic; 1142 1143 defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll< 1144 "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], 1145 [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>, 1146 AMDGPUImageDMaskIntrinsic; 1147 defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa< 1148 "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], 1149 [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>, 1150 AMDGPUImageDMaskIntrinsic; 1151 1152 ////////////////////////////////////////////////////////////////////////// 1153 // MSAA intrinsics 1154 ////////////////////////////////////////////////////////////////////////// 1155 foreach dim = AMDGPUDims.Msaa in { 1156 def int_amdgcn_image_msaa_load_x # _ # dim.Name: 1157 AMDGPUImageDimIntrinsic< 1158 AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>, 1159 [IntrReadMem], [SDNPMemOperand]>; 1160 } 1161 1162 foreach dim = AMDGPUDims.Msaa in { 1163 def int_amdgcn_image_msaa_load # _ # dim.Name: 1164 AMDGPUImageDimIntrinsic< 1165 AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>, 1166 [IntrReadMem], [SDNPMemOperand]>; 1167 } 1168 1169 ////////////////////////////////////////////////////////////////////////// 1170 // sample and getlod intrinsics 1171 ////////////////////////////////////////////////////////////////////////// 1172 multiclass AMDGPUImageDimSampleDims<string opmod, 1173 AMDGPUSampleVariant sample, 1174 bit NoMem = false> { 1175 foreach dim = AMDGPUDims.NoMsaa in { 1176 def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic< 1177 AMDGPUDimSampleProfile<opmod, dim, sample>, 1178 !listconcat(!if(NoMem, [IntrNoMem], [IntrReadMem]), 1179 !if(sample.UsesWQM, [IntrConvergent], [])), 1180 !if(NoMem, [], [SDNPMemOperand])>; 1181 } 1182 } 1183 1184 foreach sample = AMDGPUSampleVariants in { 1185 defm int_amdgcn_image_sample # sample.LowerCaseMod 1186 : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>, 1187 AMDGPUImageDMaskIntrinsic; 1188 } 1189 1190 multiclass AMDGPUImageDimSampleNoReturnDims<string opmod, 1191 AMDGPUSampleVariant sample> { 1192 foreach dim = AMDGPUDims.NoMsaa in { 1193 def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic< 1194 AMDGPUDimSampleNoReturnProfile<opmod, dim, sample>, 1195 !listconcat([IntrWillReturn], !if(sample.UsesWQM, [IntrConvergent], [])), 1196 [SDNPMemOperand]>; 1197 } 1198 } 1199 foreach sample = AMDGPUSampleVariants in { 1200 defm int_amdgcn_image_sample # sample.LowerCaseMod 1201 : AMDGPUImageDimSampleNoReturnDims< 1202 "SAMPLE" # sample.UpperCaseMod # "_nortn", sample>, 1203 AMDGPUImageDMaskIntrinsic; 1204 } 1205 1206 defm int_amdgcn_image_getlod 1207 : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>, 1208 AMDGPUImageDMaskIntrinsic; 1209 1210 ////////////////////////////////////////////////////////////////////////// 1211 // getresinfo intrinsics 1212 ////////////////////////////////////////////////////////////////////////// 1213 foreach dim = AMDGPUDims.All in { 1214 def !strconcat("int_amdgcn_image_getresinfo_", dim.Name) 1215 : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>, 1216 AMDGPUImageDMaskIntrinsic; 1217 } 1218 1219 ////////////////////////////////////////////////////////////////////////// 1220 // gather4 intrinsics 1221 ////////////////////////////////////////////////////////////////////////// 1222 foreach sample = AMDGPUSampleVariantsNoGradients in { 1223 foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in { 1224 def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name: 1225 AMDGPUImageDimIntrinsic< 1226 AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>, 1227 [IntrReadMem], [SDNPMemOperand]>; 1228 } 1229 } 1230} 1231 1232////////////////////////////////////////////////////////////////////////// 1233// atomic intrinsics 1234////////////////////////////////////////////////////////////////////////// 1235defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { 1236 multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs, 1237 LLVMType rettype = llvm_anyint_ty> { 1238 foreach dim = AMDGPUDims.All in { 1239 def !strconcat(NAME, "_", dim.Name): 1240 AMDGPUImageDimIntrinsic<AMDGPUDimAtomicProfile<opmod, dim, dataargs, rettype>, 1241 [], [SDNPMemOperand]>; 1242 } 1243 } 1244 1245 multiclass AMDGPUImageDimAtomic<string opmod, LLVMType rettype = llvm_anyint_ty> : 1246 AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], rettype>; 1247 1248 multiclass AMDGPUImageDimFloatAtomic<string opmod> : 1249 AMDGPUImageDimAtomic<opmod, llvm_anyfloat_ty>; 1250 1251 multiclass AMDGPUImageDimAnyAtomic<string opmod> : 1252 AMDGPUImageDimAtomic<opmod, llvm_any_ty>; 1253 1254 defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAnyAtomic<"ATOMIC_SWAP">; 1255 defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; 1256 defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">; 1257 defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">; 1258 defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">; 1259 defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">; 1260 defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">; 1261 defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">; 1262 defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">; 1263 defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">; 1264 defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">; 1265 defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">; 1266 defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">; 1267 defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">; 1268 defm int_amdgcn_image_atomic_add_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_ADD_FLT">; 1269 defm int_amdgcn_image_atomic_min_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MIN_FLT">; 1270 defm int_amdgcn_image_atomic_max_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MAX_FLT">; 1271 1272 defm int_amdgcn_image_atomic_cmpswap : 1273 AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">, 1274 AMDGPUArg<LLVMMatchType<0>, "cmp">]>; 1275 1276 defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">; 1277 defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">; 1278} 1279 1280////////////////////////////////////////////////////////////////////////// 1281// Buffer intrinsics 1282////////////////////////////////////////////////////////////////////////// 1283 1284// Data type for buffer resources (V#). Maybe, in the future, we can create a 1285// similar one for textures (T#). 1286def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>; 1287 1288let TargetPrefix = "amdgcn" in { 1289 1290def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic < 1291 [AMDGPUBufferRsrcTy], 1292 [llvm_anyptr_ty, // base 1293 llvm_i16_ty, // stride (and swizzle control) 1294 llvm_i32_ty, // NumRecords / extent 1295 llvm_i32_ty], // flags 1296 // Attributes lifted from ptrmask + some extra argument attributes. 1297 [IntrNoMem, ReadNone<ArgIndex<0>>, 1298 IntrSpeculatable, IntrWillReturn]>; 1299 1300defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = { 1301 1302// Generate a buffer_load instruction that may be optimized to s_buffer_load if 1303// the offset argument is uniform. 1304def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < 1305 [llvm_any_ty], 1306 [llvm_v4i32_ty, // rsrc(SGPR) 1307 llvm_i32_ty, // byte offset 1308 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1309 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1310 // bit 3 = swz, bit 4 = scc (gfx90a) 1311 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1312 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1313 // bit 6 = swz 1314 // Note: volatile bit is **not** permitted here. 1315 [IntrNoMem, ImmArg<ArgIndex<2>>]>, 1316 AMDGPURsrcIntrinsic<0>; 1317 1318// Buffer intrinsics with separate raw and struct variants. The raw 1319// variant never has an index. The struct variant always has an index, even if 1320// it is const 0. A struct intrinsic with constant 0 index is different to the 1321// corresponding raw intrinsic on gfx9+ because the behavior of bound checking 1322// and swizzling changes depending on whether idxen is set in the instruction. 1323// These intrinsics also keep the offset and soffset arguments separate as 1324// they behave differently in bounds checking and swizzling. 1325 1326// The versions of these intrinsics that take <4 x i32> arguments are deprecated 1327// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments, 1328// which allow for improved reasoning about memory accesses. 1329// 1330// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved 1331// through to final assembly selection and is used to signal that the buffer 1332// operation is volatile. 1333class AMDGPURawBufferLoad : DefaultAttrsIntrinsic < 1334 [llvm_any_ty], 1335 [llvm_v4i32_ty, // rsrc(SGPR) 1336 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1337 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1338 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1339 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1340 // bit 3 = swz, bit 4 = scc (gfx90a) 1341 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1342 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1343 // bit 6 = swz 1344 // all: volatile op (bit 31, stripped at lowering) 1345 [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>, 1346 AMDGPURsrcIntrinsic<0>; 1347def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; 1348def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; 1349 1350class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1351 [data_ty], 1352 [llvm_v4i32_ty, // rsrc(SGPR) 1353 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1354 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1355 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1356 // bit 1 = slc, 1357 // bit 2 = dlc on gfx10+), 1358 // swizzled buffer (bit 3 = swz)) 1359 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1360 AMDGPURsrcIntrinsic<0>; 1361def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad; 1362 1363class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1364 [data_ty], 1365 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1366 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1367 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1368 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1369 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1370 // bit 3 = swz, bit 4 = scc (gfx90a) 1371 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1372 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1373 // bit 6 = swz 1374 // all: volatile op (bit 31, stripped at lowering) 1375 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1376 ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>, 1377 AMDGPURsrcIntrinsic<0>; 1378def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>; 1379def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad; 1380 1381class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1382 [data_ty], 1383 [AMDGPUBufferRsrcTy,// rsrc(SGPR) 1384 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1385 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1386 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1387 // bit 1 = slc, 1388 // bit 2 = dlc on gfx10+), 1389 // swizzled buffer (bit 3 = swz)) 1390 [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1391 AMDGPURsrcIntrinsic<0>; 1392def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad; 1393 1394class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1395 [data_ty], 1396 [llvm_v4i32_ty, // rsrc(SGPR) 1397 llvm_i32_ty, // vindex(VGPR) 1398 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1399 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1400 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1401 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1402 // bit 3 = swz, bit 4 = scc (gfx90a) 1403 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1404 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1405 // bit 6 = swz 1406 // all: volatile op (bit 31, stripped at lowering) 1407 [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1408 AMDGPURsrcIntrinsic<0>; 1409def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; 1410def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; 1411 1412class AMDGPUStructAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1413 [data_ty], 1414 [llvm_v4i32_ty, // rsrc(SGPR) 1415 llvm_i32_ty, // vindex(VGPR) 1416 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1417 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1418 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1419 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1420 // bit 3 = swz, bit 4 = scc (gfx90a) 1421 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1422 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1423 // bit 6 = swz 1424 // all: volatile op (bit 31, stripped at lowering) 1425 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1426 AMDGPURsrcIntrinsic<0>; 1427def int_amdgcn_struct_atomic_buffer_load : AMDGPUStructAtomicBufferLoad; 1428 1429class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1430 [data_ty], 1431 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1432 llvm_i32_ty, // vindex(VGPR) 1433 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1434 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1435 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1436 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1437 // bit 3 = swz, bit 4 = scc (gfx90a) 1438 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1439 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1440 // bit 6 = swz 1441 // all: volatile op (bit 31, stripped at lowering) 1442 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1443 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1444 AMDGPURsrcIntrinsic<0>; 1445def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad; 1446def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad; 1447 1448class AMDGPUStructPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1449 [data_ty], 1450 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1451 llvm_i32_ty, // vindex(VGPR) 1452 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1453 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1454 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1455 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1456 // bit 3 = swz, bit 4 = scc (gfx90a) 1457 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1458 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1459 // bit 6 = swz 1460 // all: volatile op (bit 31, stripped at lowering) 1461 [IntrArgMemOnly, NoCapture<ArgIndex<0>>, 1462 ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1463 AMDGPURsrcIntrinsic<0>; 1464def int_amdgcn_struct_ptr_atomic_buffer_load : AMDGPUStructPtrAtomicBufferLoad; 1465 1466class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1467 [], 1468 [data_ty, // vdata(VGPR) 1469 llvm_v4i32_ty, // rsrc(SGPR) 1470 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1471 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1472 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1473 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1474 // bit 3 = swz, bit 4 = scc (gfx90a) 1475 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1476 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1477 // bit 6 = swz 1478 // all: volatile op (bit 31, stripped at lowering) 1479 [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1480 AMDGPURsrcIntrinsic<1>; 1481def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>; 1482def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; 1483 1484class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1485 [], 1486 [data_ty, // vdata(VGPR) 1487 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1488 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1489 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1490 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1491 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1492 // bit 3 = swz, bit 4 = scc (gfx90a) 1493 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1494 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1495 // bit 6 = swz 1496 // all: volatile op (bit 31, stripped at lowering) 1497 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1498 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1499 AMDGPURsrcIntrinsic<1>; 1500def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>; 1501def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore; 1502 1503class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1504 [], 1505 [data_ty, // vdata(VGPR) 1506 llvm_v4i32_ty, // rsrc(SGPR) 1507 llvm_i32_ty, // vindex(VGPR) 1508 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1509 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1510 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1511 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1512 // bit 3 = swz, bit 4 = scc (gfx90a) 1513 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1514 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1515 // bit 6 = swz 1516 // all: volatile op (bit 31, stripped at lowering) 1517 [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1518 AMDGPURsrcIntrinsic<1>; 1519def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; 1520def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; 1521 1522class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1523 [], 1524 [data_ty, // vdata(VGPR) 1525 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1526 llvm_i32_ty, // vindex(VGPR) 1527 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1528 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1529 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1530 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1531 // bit 3 = swz, bit 4 = scc (gfx90a) 1532 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1533 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1534 // bit 6 = swz 1535 // all: volatile op (bit 31, stripped at lowering) 1536 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1537 ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1538 AMDGPURsrcIntrinsic<1>; 1539def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore; 1540def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore; 1541 1542class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1543 [data_ty], 1544 [LLVMMatchType<0>, // vdata(VGPR) 1545 llvm_v4i32_ty, // rsrc(SGPR) 1546 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1547 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1548 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1549 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1550 AMDGPURsrcIntrinsic<1, 0>; 1551def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; 1552def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic; 1553def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic; 1554def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic; 1555def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic; 1556def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1557def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic; 1558def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic; 1559def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1560def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic; 1561def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; 1562def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; 1563def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; 1564def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; 1565def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; 1566def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< 1567 [llvm_anyint_ty], 1568 [LLVMMatchType<0>, // src(VGPR) 1569 LLVMMatchType<0>, // cmp(VGPR) 1570 llvm_v4i32_ty, // rsrc(SGPR) 1571 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1572 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1573 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1574 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1575 AMDGPURsrcIntrinsic<2, 0>; 1576 1577class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1578 [data_ty], 1579 [LLVMMatchType<0>, // vdata(VGPR) 1580 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1581 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1582 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1583 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1584 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1585 ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1586 AMDGPURsrcIntrinsic<1, 0>; 1587 1588def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic; 1589def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic; 1590def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic; 1591def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic; 1592def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic; 1593def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1594def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic; 1595def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic; 1596def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1597def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic; 1598def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; 1599def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; 1600def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; 1601def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; 1602def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; 1603def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< 1604 [llvm_anyint_ty], 1605 [LLVMMatchType<0>, // src(VGPR) 1606 LLVMMatchType<0>, // cmp(VGPR) 1607 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1608 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1609 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1610 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1611 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1612 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1613 AMDGPURsrcIntrinsic<2, 0>; 1614 1615// gfx908 intrinsic 1616def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1617 1618// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx950, gfx12+. 1619def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1620 1621class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1622 [data_ty], 1623 [LLVMMatchType<0>, // vdata(VGPR) 1624 llvm_v4i32_ty, // rsrc(SGPR) 1625 llvm_i32_ty, // vindex(VGPR) 1626 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1627 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1628 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1629 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1630 AMDGPURsrcIntrinsic<1, 0>; 1631def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; 1632def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic; 1633def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic; 1634def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic; 1635def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic; 1636def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic; 1637def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic; 1638def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic; 1639def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; 1640def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; 1641def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; 1642def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; 1643def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; 1644def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< 1645 [llvm_anyint_ty], 1646 [LLVMMatchType<0>, // src(VGPR) 1647 LLVMMatchType<0>, // cmp(VGPR) 1648 llvm_v4i32_ty, // rsrc(SGPR) 1649 llvm_i32_ty, // vindex(VGPR) 1650 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1651 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1652 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1653 [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1654 AMDGPURsrcIntrinsic<2, 0>; 1655 1656class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1657 [data_ty], 1658 [LLVMMatchType<0>, // vdata(VGPR) 1659 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1660 llvm_i32_ty, // vindex(VGPR) 1661 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1662 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1663 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1664 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1665 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1666 AMDGPURsrcIntrinsic<1, 0>; 1667def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic; 1668def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic; 1669def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic; 1670def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic; 1671def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic; 1672def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic; 1673def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic; 1674def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic; 1675def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; 1676def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; 1677def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; 1678def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; 1679def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; 1680def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< 1681 [llvm_anyint_ty], 1682 [LLVMMatchType<0>, // src(VGPR) 1683 LLVMMatchType<0>, // cmp(VGPR) 1684 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1685 llvm_i32_ty, // vindex(VGPR) 1686 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1687 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1688 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1689 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1690 ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1691 AMDGPURsrcIntrinsic<2, 0>; 1692 1693// gfx908 intrinsic. Supports v2bf16 on gfx12+ and gfx950 1694def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1695def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1696 1697// gfx90a intrinsics 1698def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1699def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1700 1701def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1702def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1703 1704// tbuffer intrinsics, with: 1705// - raw and struct variants 1706// - joint format field 1707// - joint cachepolicy field 1708def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < 1709 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1710 [llvm_v4i32_ty, // rsrc(SGPR) 1711 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1712 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1713 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1714 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1715 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1716 // bit 3 = swz, bit 4 = scc (gfx90a) 1717 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1718 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1719 // bit 6 = swz 1720 [IntrReadMem, 1721 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1722 AMDGPURsrcIntrinsic<0>; 1723 1724def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1725 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1726 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1727 llvm_i32_ty, // offset(VGPR/imm, included in bounds` checking and swizzling) 1728 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1729 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1730 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1731 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1732 // bit 3 = swz, bit 4 = scc (gfx90a) 1733 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1734 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1735 // bit 6 = swz 1736 // all: volatile op (bit 31, stripped at lowering) 1737 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1738 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1739 AMDGPURsrcIntrinsic<0>; 1740 1741def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < 1742 [], 1743 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1744 llvm_v4i32_ty, // rsrc(SGPR) 1745 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1746 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1747 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1748 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1749 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1750 // bit 3 = swz, bit 4 = scc (gfx90a) 1751 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1752 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1753 // bit 6 = swz 1754 // all: volatile op (bit 31, stripped at lowering) 1755 [IntrWriteMem, 1756 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1757 AMDGPURsrcIntrinsic<1>; 1758 1759def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1760 [], 1761 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1762 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1763 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1764 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1765 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1766 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1767 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1768 // bit 3 = swz, bit 4 = scc (gfx90a) 1769 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1770 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1771 // bit 6 = swz 1772 // all: volatile op (bit 31, stripped at lowering) 1773 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1774 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1775 AMDGPURsrcIntrinsic<1>; 1776 1777def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < 1778 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1779 [llvm_v4i32_ty, // rsrc(SGPR) 1780 llvm_i32_ty, // vindex(VGPR) 1781 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1782 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1783 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1784 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1785 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1786 // bit 3 = swz, bit 4 = scc (gfx90a) 1787 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1788 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1789 // bit 6 = swz 1790 // all: volatile op (bit 31, stripped at lowering) 1791 [IntrReadMem, 1792 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1793 AMDGPURsrcIntrinsic<0>; 1794 1795def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1796 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1797 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1798 llvm_i32_ty, // vindex(VGPR) 1799 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1800 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1801 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1802 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1803 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1804 // bit 3 = swz, bit 4 = scc (gfx90a) 1805 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1806 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1807 // bit 6 = swz 1808 // all: volatile op (bit 31, stripped at lowering) 1809 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1810 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1811 AMDGPURsrcIntrinsic<0>; 1812 1813def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1814 [], 1815 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1816 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1817 llvm_i32_ty, // vindex(VGPR) 1818 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1819 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1820 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1821 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1822 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1823 // bit 3 = swz, bit 4 = scc (gfx90a) 1824 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1825 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1826 // bit 6 = swz 1827 // all: volatile op (bit 31, stripped at lowering) 1828 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1829 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1830 AMDGPURsrcIntrinsic<1>; 1831 1832def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < 1833 [], 1834 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1835 llvm_v4i32_ty, // rsrc(SGPR) 1836 llvm_i32_ty, // vindex(VGPR) 1837 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1838 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1839 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1840 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1841 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1842 // bit 3 = swz, bit 4 = scc (gfx90a) 1843 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1844 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1845 // bit 6 = swz 1846 // all: volatile op (bit 31, stripped at lowering) 1847 [IntrWriteMem, 1848 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1849 AMDGPURsrcIntrinsic<1>; 1850 1851class AMDGPURawBufferLoadLDS : Intrinsic < 1852 [], 1853 [llvm_v4i32_ty, // rsrc(SGPR) 1854 LLVMQualPointerType<3>, // LDS base offset 1855 llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) 1856 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1857 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1858 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1859 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1860 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1861 // bit 3 = swz, bit 4 = scc (gfx90a) 1862 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1863 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1864 // bit 6 = swz 1865 // all: volatile op (bit 31, stripped at lowering) 1866 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1867 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1868def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; 1869 1870class AMDGPURawPtrBufferLoadLDS : Intrinsic < 1871 [], 1872 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1873 LLVMQualPointerType<3>, // LDS base offset 1874 llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) 1875 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1876 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1877 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1878 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1879 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1880 // bit 3 = swz, bit 4 = scc (gfx90a) 1881 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1882 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1883 // bit 6 = swz 1884 // all: volatile op (bit 31, stripped at lowering) 1885 [IntrWillReturn, IntrArgMemOnly, 1886 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1887 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1888 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1889 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1890def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS; 1891 1892class AMDGPUStructBufferLoadLDS : Intrinsic < 1893 [], 1894 [llvm_v4i32_ty, // rsrc(SGPR) 1895 LLVMQualPointerType<3>, // LDS base offset 1896 llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) 1897 llvm_i32_ty, // vindex(VGPR) 1898 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1899 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1900 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1901 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1902 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1903 // bit 3 = swz, bit 4 = scc (gfx90a) 1904 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1905 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1906 // bit 6 = swz 1907 // all: volatile op (bit 31, stripped at lowering) 1908 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1909 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1910def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; 1911 1912class AMDGPUStructPtrBufferLoadLDS : Intrinsic < 1913 [], 1914 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1915 LLVMQualPointerType<3>, // LDS base offset 1916 llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) 1917 llvm_i32_ty, // vindex(VGPR) 1918 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1919 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1920 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1921 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1922 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1923 // bit 3 = swz, bit 4 = scc (gfx90a) 1924 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1925 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1926 // bit 6 = swz 1927 // all: volatile op (bit 31, stripped at lowering) 1928 [IntrWillReturn, IntrArgMemOnly, 1929 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1930 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1931 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1932 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1933def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS; 1934 1935def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic < 1936 [], 1937 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1938 llvm_i32_ty, // offset (imm) 1939 llvm_i32_ty], // len (SGPR/imm) 1940 [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<1>>], "", [SDNPMemOperand]>, 1941 AMDGPURsrcIntrinsic<0>, 1942 ClangBuiltin<"__builtin_amdgcn_s_buffer_prefetch_data">; 1943 1944} // defset AMDGPUBufferIntrinsics 1945 1946// Uses that do not set the done bit should set IntrWriteMem on the 1947// call site. 1948def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ 1949 llvm_i32_ty, // tgt, 1950 llvm_i32_ty, // en 1951 llvm_any_ty, // src0 (f32 or i32) 1952 LLVMMatchType<0>, // src1 1953 LLVMMatchType<0>, // src2 1954 LLVMMatchType<0>, // src3 1955 llvm_i1_ty, // done 1956 llvm_i1_ty // vm (ignored on GFX11+) 1957 ], 1958 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1959 ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly] 1960>; 1961 1962// exp with row_en bit set. Only supported on GFX11+. 1963def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [ 1964 llvm_i32_ty, // tgt, 1965 llvm_i32_ty, // en 1966 llvm_any_ty, // src0 (f32 or i32) 1967 LLVMMatchType<0>, // src1 1968 LLVMMatchType<0>, // src2 1969 LLVMMatchType<0>, // src3 1970 llvm_i1_ty, // done 1971 llvm_i32_ty], // row number 1972 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1973 IntrWriteMem, IntrInaccessibleMemOnly] 1974>; 1975 1976// exp with compr bit set. Not supported on GFX11+. 1977def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [ 1978 llvm_i32_ty, // tgt, 1979 llvm_i32_ty, // en 1980 llvm_anyvector_ty, // src0 (v2f16 or v2i16) 1981 LLVMMatchType<0>, // src1 1982 llvm_i1_ty, // done 1983 llvm_i1_ty], // vm 1984 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, 1985 ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly] 1986>; 1987 1988def int_amdgcn_buffer_wbinvl1_sc : 1989 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, 1990 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1991 1992def int_amdgcn_buffer_wbinvl1 : 1993 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, 1994 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1995 1996def int_amdgcn_s_dcache_inv : 1997 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, 1998 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1999 2000def int_amdgcn_s_memtime : 2001 ClangBuiltin<"__builtin_amdgcn_s_memtime">, 2002 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; 2003 2004def int_amdgcn_s_sleep : 2005 ClangBuiltin<"__builtin_amdgcn_s_sleep">, 2006 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 2007 IntrHasSideEffects]> { 2008} 2009 2010def int_amdgcn_s_sleep_var 2011 : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">, 2012 Intrinsic<[], [llvm_i32_ty], 2013 [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { 2014} 2015 2016def int_amdgcn_s_nop : 2017 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 2018 IntrHasSideEffects]> { 2019} 2020 2021def int_amdgcn_s_incperflevel : 2022 ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, 2023 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 2024 IntrHasSideEffects]> { 2025} 2026 2027def int_amdgcn_s_decperflevel : 2028 ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, 2029 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 2030 IntrHasSideEffects]> { 2031} 2032 2033def int_amdgcn_s_sethalt : 2034 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 2035 IntrHasSideEffects]>; 2036 2037def int_amdgcn_s_setprio : 2038 ClangBuiltin<"__builtin_amdgcn_s_setprio">, 2039 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 2040 IntrHasSideEffects]>; 2041 2042def int_amdgcn_s_ttracedata : 2043 ClangBuiltin<"__builtin_amdgcn_s_ttracedata">, 2044 DefaultAttrsIntrinsic<[], [llvm_i32_ty], 2045 [IntrNoMem, IntrHasSideEffects]>; 2046 2047def int_amdgcn_s_ttracedata_imm : 2048 ClangBuiltin<"__builtin_amdgcn_s_ttracedata_imm">, 2049 DefaultAttrsIntrinsic<[], [llvm_i16_ty], 2050 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>; 2051 2052// This is IntrHasSideEffects so it can be used to read cycle counters. 2053def int_amdgcn_s_getreg : 2054 ClangBuiltin<"__builtin_amdgcn_s_getreg">, 2055 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], 2056 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 2057>; 2058 2059// Note this can be used to set FP environment properties that are 2060// unsafe to change in non-strictfp functions. The register properties 2061// available (and value required to access them) may differ per 2062// subtarget. llvm.amdgcn.s.setreg(hwmode, value) 2063def int_amdgcn_s_setreg : 2064 ClangBuiltin<"__builtin_amdgcn_s_setreg">, 2065 DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty], 2066 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 2067>; 2068 2069// int_amdgcn_s_getpc is provided to allow a specific style of position 2070// independent code to determine the high part of its address when it is 2071// known (through convention) that the code and any data of interest does 2072// not cross a 4Gb address boundary. Use for any other purpose may not 2073// produce the desired results as optimizations may cause code movement, 2074// especially as we explicitly use IntrNoMem to allow optimizations. 2075// This intrinsic always returns PC sign-extended from 48 bits even if the 2076// s_getpc_b64 instruction returns a zero-extended value. 2077def int_amdgcn_s_getpc : 2078 ClangBuiltin<"__builtin_amdgcn_s_getpc">, 2079 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [NoUndef<RetIndex>, IntrNoMem, 2080 IntrSpeculatable, IntrWillReturn]>; 2081 2082// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0> 2083// param values: 0 = P10, 1 = P20, 2 = P0 2084def int_amdgcn_interp_mov : 2085 ClangBuiltin<"__builtin_amdgcn_interp_mov">, 2086 DefaultAttrsIntrinsic<[llvm_float_ty], 2087 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2088 [IntrNoMem, IntrSpeculatable, 2089 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 2090 2091// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0> 2092// This intrinsic reads from lds, but the memory values are constant, 2093// so it behaves like IntrNoMem. 2094def int_amdgcn_interp_p1 : 2095 ClangBuiltin<"__builtin_amdgcn_interp_p1">, 2096 DefaultAttrsIntrinsic<[llvm_float_ty], 2097 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2098 [IntrNoMem, IntrSpeculatable, 2099 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 2100 2101// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> 2102def int_amdgcn_interp_p2 : 2103 ClangBuiltin<"__builtin_amdgcn_interp_p2">, 2104 DefaultAttrsIntrinsic<[llvm_float_ty], 2105 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2106 [IntrNoMem, IntrSpeculatable, 2107 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 2108 // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. 2109 2110// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> 2111// high selects whether high or low 16-bits are loaded from LDS 2112def int_amdgcn_interp_p1_f16 : 2113 ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, 2114 DefaultAttrsIntrinsic<[llvm_float_ty], 2115 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 2116 [IntrNoMem, IntrSpeculatable, 2117 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 2118 2119// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> 2120// high selects whether high or low 16-bits are loaded from LDS 2121def int_amdgcn_interp_p2_f16 : 2122 ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, 2123 DefaultAttrsIntrinsic<[llvm_half_ty], 2124 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 2125 [IntrNoMem, IntrSpeculatable, 2126 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; 2127 2128// llvm.amdgcn.lds.direct.load <m0> 2129// The input argument is m0, which contains a packed combination of address 2130// offset and flags describing the data type. 2131def int_amdgcn_lds_direct_load : 2132 DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 2133 [llvm_i32_ty], 2134 [IntrReadMem, IntrSpeculatable]>; 2135 2136// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0> 2137// Like interp intrinsics, this reads from lds, but the memory values are constant, 2138// so it behaves like IntrNoMem. 2139def int_amdgcn_lds_param_load : 2140 DefaultAttrsIntrinsic<[llvm_float_ty], 2141 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2142 [IntrNoMem, IntrSpeculatable, 2143 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; 2144 2145// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> 2146def int_amdgcn_interp_inreg_p10 : 2147 DefaultAttrsIntrinsic<[llvm_float_ty], 2148 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 2149 [IntrNoMem, IntrSpeculatable]>; 2150 2151// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> 2152def int_amdgcn_interp_inreg_p2 : 2153 DefaultAttrsIntrinsic<[llvm_float_ty], 2154 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 2155 [IntrNoMem, IntrSpeculatable]>; 2156 2157// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> 2158// high selects whether high or low 16-bits are used for p and p0 operands 2159def int_amdgcn_interp_inreg_p10_f16: 2160 DefaultAttrsIntrinsic<[llvm_float_ty], 2161 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2162 [IntrNoMem, IntrSpeculatable, 2163 ImmArg<ArgIndex<3>>]>; 2164 2165// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> 2166// high selects whether high or low 16-bits are used for p operand 2167def int_amdgcn_interp_inreg_p2_f16 : 2168 DefaultAttrsIntrinsic<[llvm_half_ty], 2169 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2170 [IntrNoMem, IntrSpeculatable, 2171 ImmArg<ArgIndex<3>>]>; 2172 2173// llvm.amdgcn.interp.p10.rtz.f16 <p>, <i>, <p0>, <high> 2174// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode. 2175// high selects whether high or low 16-bits are used for p and p0 operands 2176def int_amdgcn_interp_p10_rtz_f16: 2177 DefaultAttrsIntrinsic<[llvm_float_ty], 2178 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2179 [IntrNoMem, IntrSpeculatable, 2180 ImmArg<ArgIndex<3>>]>; 2181 2182// llvm.amdgcn.interp.p2.rtz.f16 <p>, <j>, <tmp>, <high> 2183// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode. 2184// high selects whether high or low 16-bits are used for p operand 2185def int_amdgcn_interp_p2_rtz_f16 : 2186 DefaultAttrsIntrinsic<[llvm_half_ty], 2187 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2188 [IntrNoMem, IntrSpeculatable, 2189 ImmArg<ArgIndex<3>>]>; 2190 2191// Deprecated: use llvm.amdgcn.live.mask instead. 2192def int_amdgcn_ps_live : DefaultAttrsIntrinsic < 2193 [llvm_i1_ty], 2194 [], 2195 [IntrNoMem]>; 2196 2197// Query currently live lanes. 2198// Returns true if lane is live (and not a helper lane). 2199def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty], 2200 [], [NoUndef<RetIndex>, IntrReadMem, IntrInaccessibleMemOnly] 2201>; 2202 2203def int_amdgcn_mbcnt_lo : 2204 ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, 2205 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2206 [IntrNoMem]>; 2207 2208def int_amdgcn_mbcnt_hi : 2209 ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, 2210 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2211 [IntrNoMem]>; 2212 2213// llvm.amdgcn.ds.swizzle src offset 2214def int_amdgcn_ds_swizzle : 2215 ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, 2216 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2217 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, 2218 ImmArg<ArgIndex<1>>]>; 2219 2220def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2221 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 2222 [IntrNoMem, IntrSpeculatable] 2223>; 2224 2225def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2226 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 2227 [IntrNoMem, IntrSpeculatable] 2228>; 2229 2230def int_amdgcn_lerp : 2231 ClangBuiltin<"__builtin_amdgcn_lerp">, 2232 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2233 [IntrNoMem, IntrSpeculatable] 2234>; 2235 2236def int_amdgcn_sad_u8 : 2237 ClangBuiltin<"__builtin_amdgcn_sad_u8">, 2238 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2239 [IntrNoMem, IntrSpeculatable] 2240>; 2241 2242def int_amdgcn_msad_u8 : 2243 ClangBuiltin<"__builtin_amdgcn_msad_u8">, 2244 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2245 [IntrNoMem, IntrSpeculatable] 2246>; 2247 2248def int_amdgcn_sad_hi_u8 : 2249 ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, 2250 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2251 [IntrNoMem, IntrSpeculatable] 2252>; 2253 2254def int_amdgcn_sad_u16 : 2255 ClangBuiltin<"__builtin_amdgcn_sad_u16">, 2256 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2257 [IntrNoMem, IntrSpeculatable] 2258>; 2259 2260def int_amdgcn_qsad_pk_u16_u8 : 2261 ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, 2262 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 2263 [IntrNoMem, IntrSpeculatable] 2264>; 2265 2266def int_amdgcn_mqsad_pk_u16_u8 : 2267 ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, 2268 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 2269 [IntrNoMem, IntrSpeculatable] 2270>; 2271 2272def int_amdgcn_mqsad_u32_u8 : 2273 ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, 2274 DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], 2275 [IntrNoMem, IntrSpeculatable] 2276>; 2277 2278def int_amdgcn_cvt_pk_u8_f32 : 2279 ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, 2280 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], 2281 [IntrNoMem, IntrSpeculatable] 2282>; 2283 2284def int_amdgcn_icmp : 2285 Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty], 2286 [IntrNoMem, IntrConvergent, 2287 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2288 2289def int_amdgcn_fcmp : 2290 Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty], 2291 [IntrNoMem, IntrConvergent, 2292 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2293 2294// Returns a bitfield(i32 or i64) containing the result of its i1 argument 2295// in all active lanes, and zero in all inactive lanes. 2296def int_amdgcn_ballot : 2297 Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], 2298 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2299 2300def int_amdgcn_inverse_ballot : 2301 Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], 2302 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2303 2304// Lowers to S_BITREPLICATE_B64_B32. 2305// The argument must be uniform; otherwise, the result is undefined. 2306def int_amdgcn_s_bitreplicate : 2307 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>; 2308 2309// Lowers to S_QUADMASK_B{32,64} 2310// The argument must be uniform; otherwise, the result is undefined. 2311def int_amdgcn_s_quadmask : 2312 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2313 2314// Lowers to S_WQM_B{32,64} 2315// The argument must be uniform; otherwise, the result is undefined. 2316// Does not set WQM; merely calculates the bitmask. 2317def int_amdgcn_s_wqm : 2318 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2319 2320class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic< 2321 [data_ty], 2322 [ 2323 LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR) 2324 llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default, 2325 // 1: Iterative strategy, and 2326 // 2. DPP) 2327 ], 2328 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; 2329 2330def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; 2331def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; 2332 2333def int_amdgcn_readfirstlane : 2334 Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], 2335 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2336 2337// The lane argument must be uniform across the currently active threads of the 2338// current wave. Otherwise, the result is undefined. 2339def int_amdgcn_readlane : 2340 Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], 2341 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2342 2343// The value to write and lane select arguments must be uniform across the 2344// currently active threads of the current wave. Otherwise, the result is 2345// undefined. 2346def int_amdgcn_writelane : 2347 Intrinsic<[llvm_any_ty], [ 2348 LLVMMatchType<0>, // uniform value to write: returned by the selected lane 2349 llvm_i32_ty, // uniform lane select 2350 LLVMMatchType<0> // returned by all lanes other than the selected one 2351 ], 2352 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2353>; 2354 2355def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, 2356 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2357 [IntrNoMem, IntrSpeculatable] 2358>; 2359 2360// mul24 intrinsics can return i32 or i64. 2361// When returning i64, they're lowered to a mul24/mulhi24 pair. 2362def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2363 [llvm_i32_ty, llvm_i32_ty], 2364 [IntrNoMem, IntrSpeculatable] 2365>; 2366 2367def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2368 [llvm_i32_ty, llvm_i32_ty], 2369 [IntrNoMem, IntrSpeculatable] 2370>; 2371 2372def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2373 [llvm_i32_ty, llvm_i32_ty], 2374 [IntrNoMem, IntrSpeculatable] 2375>; 2376 2377def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2378 [llvm_i32_ty, llvm_i32_ty], 2379 [IntrNoMem, IntrSpeculatable] 2380>; 2381 2382// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) 2383// 2384// bar_val is the total number of waves that will wait on this 2385// barrier, minus 1. 2386def int_amdgcn_ds_gws_init : 2387 ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, 2388 Intrinsic<[], 2389 [llvm_i32_ty, llvm_i32_ty], 2390 [IntrConvergent, IntrWriteMem, 2391 IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2392 [SDNPMemOperand] 2393>; 2394 2395// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) 2396// bar_val is the total number of waves that will wait on this 2397// barrier, minus 1. 2398def int_amdgcn_ds_gws_barrier : 2399 ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, 2400 Intrinsic<[], 2401 [llvm_i32_ty, llvm_i32_ty], 2402 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2403 [SDNPMemOperand] 2404>; 2405 2406// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) 2407def int_amdgcn_ds_gws_sema_v : 2408 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, 2409 Intrinsic<[], 2410 [llvm_i32_ty], 2411 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2412 [SDNPMemOperand] 2413>; 2414 2415// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) 2416def int_amdgcn_ds_gws_sema_br : 2417 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, 2418 Intrinsic<[], 2419 [llvm_i32_ty, llvm_i32_ty], 2420 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2421 [SDNPMemOperand] 2422>; 2423 2424// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) 2425def int_amdgcn_ds_gws_sema_p : 2426 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, 2427 Intrinsic<[], 2428 [llvm_i32_ty], 2429 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2430 [SDNPMemOperand] 2431>; 2432 2433// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) 2434def int_amdgcn_ds_gws_sema_release_all : 2435 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, 2436 Intrinsic<[], 2437 [llvm_i32_ty], 2438 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2439 [SDNPMemOperand] 2440>; 2441 2442 2443// Copies the source value to the destination value, with the guarantee that 2444// the source value is computed as if the entire program were executed in WQM. 2445def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], 2446 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2447>; 2448 2449// Copies the source value to the destination value, such that the source 2450// is computed as if the entire program were executed in WQM if any other 2451// program code executes in WQM. 2452def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], 2453 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2454>; 2455 2456// Return true if at least one thread within the pixel quad passes true into 2457// the function. 2458def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], 2459 [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2460>; 2461 2462// If false, set EXEC=0 for the current thread until the end of program. 2463// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 2464def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; 2465 2466def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, 2467 Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent, 2468 IntrNoCallback, IntrNoFree] 2469>; 2470 2471// If false, mark all active lanes as helper lanes until the end of program. 2472def int_amdgcn_wqm_demote : Intrinsic<[], 2473 [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] 2474>; 2475 2476// Copies the active channels of the source value to the destination value, 2477// with the guarantee that the source value is computed as if the entire 2478// program were executed in Whole Wavefront Mode, i.e. with all channels 2479// enabled, with a few exceptions: - Phi nodes which require WWM return an 2480// undefined value. 2481def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], 2482 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2483 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2484>; 2485// Deprecated. Use int_amdgcn_strict_wwm instead. 2486def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], 2487 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2488 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2489>; 2490def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], 2491 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2492 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2493>; 2494 2495// Given a value, copies it while setting all the inactive lanes to a given 2496// value. Note that OpenGL helper lanes are considered active, so if the 2497// program ever uses WQM, then the instruction and the first source will be 2498// computed in WQM. 2499def int_amdgcn_set_inactive : 2500 Intrinsic<[llvm_any_ty], 2501 [LLVMMatchType<0>, // value to be copied 2502 LLVMMatchType<0>], // value for the inactive lanes to take 2503 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2504 2505// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must 2506// be a VGPR function argument. 2507// Can only be used in functions with the `amdgpu_cs_chain` or 2508// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2509// flow. 2510def int_amdgcn_set_inactive_chain_arg : 2511 Intrinsic<[llvm_anyint_ty], 2512 [LLVMMatchType<0>, // value to be copied 2513 LLVMMatchType<0>], // value for the inactive lanes to take 2514 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2515 2516// Return if the given flat pointer points to a local memory address. 2517def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, 2518 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2519 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2520>; 2521 2522// Return if the given flat pointer points to a prvate memory address. 2523def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, 2524 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2525 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2526>; 2527 2528// A uniform tail call to a function with the `amdgpu_cs_chain` or 2529// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs 2530// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to 2531// the given function. 2532// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or 2533// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2534// flow. 2535def int_amdgcn_cs_chain: 2536 Intrinsic<[], 2537 [llvm_anyptr_ty, // The function to jump to. 2538 llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64). 2539 llvm_any_ty, // Arguments that will be copied into SGPRs (s0+). 2540 // Must be uniform. 2541 llvm_any_ty, // Arguments that will be copied into VGPRs (v8+). 2542 // Need not be uniform. 2543 llvm_i32_ty, // Flags. 2544 llvm_vararg_ty // Additional arguments. Only present if Flags is 2545 // non-zero. 2546 ], 2547 [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>; 2548 2549 2550//===----------------------------------------------------------------------===// 2551// CI+ Intrinsics 2552//===----------------------------------------------------------------------===// 2553 2554def int_amdgcn_s_dcache_inv_vol : 2555 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, 2556 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2557 2558def int_amdgcn_buffer_wbinvl1_vol : 2559 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, 2560 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2561 2562//===----------------------------------------------------------------------===// 2563// VI Intrinsics 2564//===----------------------------------------------------------------------===// 2565 2566// The llvm.amdgcn.mov.dpp intrinsic represents the mov.dpp operation in AMDGPU. 2567// This operation is being deprecated and can be replaced with 2568// llvm.amdgcn.update.dpp. 2569// llvm.amdgcn.mov.dpp <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2570def int_amdgcn_mov_dpp : 2571 Intrinsic<[llvm_anyint_ty], 2572 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 2573 llvm_i1_ty], 2574 [IntrNoMem, IntrConvergent, IntrWillReturn, 2575 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, 2576 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2577 2578// The llvm.amdgcn.update.dpp intrinsic represents the update.dpp operation in 2579// AMDGPU. It takes an old value, a source operand, a DPP control operand, a row 2580// mask, a bank mask, and a bound control. This operation is equivalent to a 2581// sequence of v_mov_b32 operations. It is preferred over llvm.amdgcn.mov.dpp 2582// for future use. 2583// llvm.amdgcn.update.dpp <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2584// Should be equivalent to: 2585// v_mov_b32 <dest> <old> 2586// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2587def int_amdgcn_update_dpp : 2588 Intrinsic<[llvm_any_ty], 2589 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, 2590 llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], 2591 [IntrNoMem, IntrConvergent, IntrWillReturn, 2592 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, 2593 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2594 2595def int_amdgcn_s_dcache_wb : 2596 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, 2597 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2598 2599def int_amdgcn_s_dcache_wb_vol : 2600 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, 2601 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2602 2603def int_amdgcn_s_memrealtime : 2604 ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, 2605 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2606 2607// llvm.amdgcn.ds.permute <index> <src> 2608def int_amdgcn_ds_permute : 2609 ClangBuiltin<"__builtin_amdgcn_ds_permute">, 2610 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2611 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2612 2613// llvm.amdgcn.ds.bpermute <index> <src> 2614def int_amdgcn_ds_bpermute : 2615 ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, 2616 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2617 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2618 2619// llvm.amdgcn.perm <src0> <src1> <selector> 2620def int_amdgcn_perm : 2621 ClangBuiltin<"__builtin_amdgcn_perm">, 2622 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2623 [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2624 2625//===----------------------------------------------------------------------===// 2626// GFX9 Intrinsics 2627//===----------------------------------------------------------------------===// 2628 2629class AMDGPUGlobalLoadLDS : 2630 ClangBuiltin<"__builtin_amdgcn_global_load_lds">, 2631 Intrinsic < 2632 [], 2633 [LLVMQualPointerType<1>, // Base global pointer to load from 2634 LLVMQualPointerType<3>, // LDS base pointer to store to 2635 llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) 2636 llvm_i32_ty, // imm offset (applied to both global and LDS address) 2637 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, 2638 // bit 1 = sc1, 2639 // bit 4 = scc)) 2640 [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, 2641 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], 2642 "", [SDNPMemOperand]>; 2643def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; 2644 2645// This is IntrHasSideEffects because it reads from a volatile hardware register. 2646def int_amdgcn_pops_exiting_wave_id : 2647 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>; 2648 2649//===----------------------------------------------------------------------===// 2650// GFX10 Intrinsics 2651//===----------------------------------------------------------------------===// 2652 2653// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> 2654def int_amdgcn_permlane16 : 2655 Intrinsic<[llvm_any_ty], 2656 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2657 [IntrNoMem, IntrConvergent, IntrWillReturn, 2658 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2659 2660// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> 2661def int_amdgcn_permlanex16 : 2662 Intrinsic<[llvm_any_ty], 2663 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2664 [IntrNoMem, IntrConvergent, IntrWillReturn, 2665 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2666 2667// llvm.amdgcn.mov.dpp8 <src> <sel> 2668// <sel> is a 32-bit constant whose high 8 bits must be zero which selects 2669// the lanes to read from. 2670def int_amdgcn_mov_dpp8 : 2671 Intrinsic<[llvm_any_ty], 2672 [LLVMMatchType<0>, llvm_i32_ty], 2673 [IntrNoMem, IntrConvergent, IntrWillReturn, 2674 ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; 2675 2676def int_amdgcn_s_get_waveid_in_workgroup : 2677 ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, 2678 Intrinsic<[llvm_i32_ty], [], 2679 [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2680 2681class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic < 2682 [vt], 2683 [pt, // vaddr 2684 vt], // vdata(VGPR) 2685 [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", 2686 [SDNPMemOperand]>; 2687 2688def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>; 2689 2690// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, 2691// <ray_dir>, <ray_inv_dir>, <texture_descr> 2692// <node_ptr> is i32 or i64. 2693// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. 2694def int_amdgcn_image_bvh_intersect_ray : 2695 DefaultAttrsIntrinsic<[llvm_v4i32_ty], 2696 [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, 2697 LLVMMatchType<1>, llvm_v4i32_ty], 2698 [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2699 2700//===----------------------------------------------------------------------===// 2701// GFX11 Intrinsics 2702//===----------------------------------------------------------------------===// 2703 2704// llvm.amdgcn.permlane64 <src0> 2705def int_amdgcn_permlane64 : 2706 Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], 2707 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2708 2709def int_amdgcn_ds_add_gs_reg_rtn : 2710 ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, 2711 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2712 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2713 "", [SDNPMemOperand]>; 2714 2715def int_amdgcn_ds_sub_gs_reg_rtn : 2716 ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, 2717 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2718 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2719 "", [SDNPMemOperand]>; 2720 2721def int_amdgcn_ds_bvh_stack_rtn : 2722 Intrinsic< 2723 [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr 2724 [ 2725 llvm_i32_ty, // %addr 2726 llvm_i32_ty, // %data0 2727 llvm_v4i32_ty, // %data1 2728 llvm_i32_ty, // %offset 2729 ], 2730 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2731 >; 2732 2733def int_amdgcn_s_wait_event_export_ready : 2734 ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, 2735 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] 2736>; 2737 2738// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2739// 2740// These operations perform a matrix multiplication and accumulation of 2741// the form: D = A * B + C . 2742 2743class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : 2744 Intrinsic< 2745 [CD], // %D 2746 [ 2747 AB, // %A 2748 LLVMMatchType<1>, // %B 2749 LLVMMatchType<0>, // %C 2750 ], 2751 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2752>; 2753 2754class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : 2755 Intrinsic< 2756 [CD], // %D 2757 [ 2758 AB, // %A 2759 LLVMMatchType<1>, // %B 2760 LLVMMatchType<0>, // %C 2761 llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 2762 ], 2763 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2764>; 2765 2766class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : 2767 Intrinsic< 2768 [CD], // %D 2769 [ 2770 llvm_i1_ty, // %A_sign 2771 AB, // %A 2772 llvm_i1_ty, // %B_sign 2773 LLVMMatchType<1>, // %B 2774 LLVMMatchType<0>, // %C 2775 llvm_i1_ty, // %clamp 2776 ], 2777 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2778>; 2779 2780// WMMA GFX11Only 2781 2782// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2783// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. 2784// The content of the other 16-bit half is preserved from the input. 2785 2786defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = { 2787def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2788def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2789 2790// WMMA GFX11Plus 2791 2792def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2793def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2794def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2795def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2796 2797// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2798// The content of the other 16-bit half is undefined. 2799// GFX12: The op_sel bit must be 0. 2800def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2801def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2802} 2803 2804//===----------------------------------------------------------------------===// 2805// GFX12 Intrinsics 2806//===----------------------------------------------------------------------===// 2807 2808// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control> 2809def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, 2810 Intrinsic<[llvm_i32_ty], 2811 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2812 [IntrNoMem, IntrConvergent, IntrWillReturn, 2813 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2814 2815// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control> 2816def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">, 2817 Intrinsic<[llvm_i32_ty], 2818 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2819 [IntrNoMem, IntrConvergent, IntrWillReturn, 2820 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2821 2822// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics 2823// 2824// These operations perform a sparse matrix multiplication and accumulation of 2825// the form: D = A * B + C. 2826// A is sparse matrix, half the size of B, and is expanded using sparsity index. 2827 2828class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2829 Intrinsic< 2830 [CD], // %D 2831 [ 2832 A, // %A 2833 B, // %B 2834 LLVMMatchType<0>, // %C 2835 Index // %Sparsity index for A 2836 ], 2837 [IntrNoMem, IntrConvergent, IntrWillReturn] 2838>; 2839 2840class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2841 Intrinsic< 2842 [CD], // %D 2843 [ 2844 llvm_i1_ty, // %A_sign 2845 A, // %A 2846 llvm_i1_ty, // %B_sign 2847 B, // %B 2848 LLVMMatchType<0>, // %C 2849 Index, // %Sparsity index for A 2850 llvm_i1_ty, // %clamp 2851 ], 2852 [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>] 2853>; 2854 2855defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = { 2856// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2857// 2858// These operations perform a matrix multiplication and accumulation of 2859// the form: D = A * B + C . 2860 2861// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. 2862def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2863def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2864def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2865def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2866// A and B are <16 x iu4>. 2867def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2868 2869def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2870def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2871def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2872def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2873def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2874def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2875def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2876def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2877def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2878def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2879def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2880} 2881 2882def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>; 2883 2884def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2885def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2886def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2887def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2888 2889def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; 2890 2891class AMDGPULoadIntrinsic<LLVMType ptr_ty>: 2892 Intrinsic< 2893 [llvm_any_ty], 2894 [ptr_ty], 2895 [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], 2896 "", 2897 [SDNPMemOperand] 2898 >; 2899 2900// Wave32 2901// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64 2902// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128 2903// Wave64 2904// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64 2905// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128 2906 2907def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; 2908def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; 2909def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; 2910def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>; 2911def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; 2912def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; 2913 2914// i32 @llvm.amdgcn.wave.id() 2915def int_amdgcn_wave_id : 2916 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 2917 2918def int_amdgcn_s_prefetch_data : 2919 Intrinsic<[], 2920 [llvm_anyptr_ty, // Pointer to a constant/global memory 2921 llvm_i32_ty], // Length to prefetch 0-31 (1-32 chaunks, units of 128 bytes) 2922 [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], 2923 "", [SDNPMemOperand] 2924 >; 2925 2926// llvm.amdgcn.ds.bpermute.fi.b32 <index> <src> 2927def int_amdgcn_ds_bpermute_fi_b32 : 2928 ClangBuiltin<"__builtin_amdgcn_ds_bpermute_fi_b32">, 2929 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2930 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2931 2932//===----------------------------------------------------------------------===// 2933// Deep learning intrinsics. 2934//===----------------------------------------------------------------------===// 2935 2936// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) 2937// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2938def int_amdgcn_fdot2 : 2939 ClangBuiltin<"__builtin_amdgcn_fdot2">, 2940 DefaultAttrsIntrinsic< 2941 [llvm_float_ty], // %r 2942 [ 2943 llvm_v2f16_ty, // %a 2944 llvm_v2f16_ty, // %b 2945 llvm_float_ty, // %c 2946 llvm_i1_ty // %clamp 2947 ], 2948 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2949 >; 2950 2951// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) 2952// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2953def int_amdgcn_fdot2_f16_f16 : 2954 ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, 2955 DefaultAttrsIntrinsic< 2956 [llvm_half_ty], // %r 2957 [ 2958 llvm_v2f16_ty, // %a 2959 llvm_v2f16_ty, // %b 2960 llvm_half_ty // %c 2961 ], 2962 [IntrNoMem, IntrSpeculatable] 2963 >; 2964 2965// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) 2966// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2967def int_amdgcn_fdot2_bf16_bf16 : 2968 ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, 2969 DefaultAttrsIntrinsic< 2970 [llvm_bfloat_ty], // %r 2971 [ 2972 llvm_v2bf16_ty, // %a 2973 llvm_v2bf16_ty, // %b 2974 llvm_bfloat_ty // %c 2975 ], 2976 [IntrNoMem, IntrSpeculatable] 2977 >; 2978 2979// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) 2980// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2981def int_amdgcn_fdot2_f32_bf16 : 2982 ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, 2983 DefaultAttrsIntrinsic< 2984 [llvm_float_ty], // %r 2985 [ 2986 llvm_v2bf16_ty, // %a 2987 llvm_v2bf16_ty, // %b 2988 llvm_float_ty, // %c 2989 llvm_i1_ty // %clamp 2990 ], 2991 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2992 >; 2993 2994// f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) 2995// %r = %a[0] * %b[0] + %a[1] * %b[1] + c 2996// TODO: This actually is similar to llvm.amdgcn.fdot2 intrinsics which produces 2997// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these. 2998 2999def int_amdgcn_fdot2c_f32_bf16 : 3000 ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">, 3001 DefaultAttrsIntrinsic< 3002 [llvm_float_ty], // %r 3003 [ 3004 llvm_v2bf16_ty, // %a 3005 llvm_v2bf16_ty, // %b 3006 llvm_float_ty, // %c 3007 llvm_i1_ty // %clamp 3008 ], 3009 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3010 >; 3011 3012// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) 3013// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 3014def int_amdgcn_sdot2 : 3015 ClangBuiltin<"__builtin_amdgcn_sdot2">, 3016 DefaultAttrsIntrinsic< 3017 [llvm_i32_ty], // %r 3018 [ 3019 llvm_v2i16_ty, // %a 3020 llvm_v2i16_ty, // %b 3021 llvm_i32_ty, // %c 3022 llvm_i1_ty // %clamp 3023 ], 3024 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3025 >; 3026 3027// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) 3028// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 3029def int_amdgcn_udot2 : 3030 ClangBuiltin<"__builtin_amdgcn_udot2">, 3031 DefaultAttrsIntrinsic< 3032 [llvm_i32_ty], // %r 3033 [ 3034 llvm_v2i16_ty, // %a 3035 llvm_v2i16_ty, // %b 3036 llvm_i32_ty, // %c 3037 llvm_i1_ty // %clamp 3038 ], 3039 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3040 >; 3041 3042// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) 3043// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 3044def int_amdgcn_sdot4 : 3045 ClangBuiltin<"__builtin_amdgcn_sdot4">, 3046 DefaultAttrsIntrinsic< 3047 [llvm_i32_ty], // %r 3048 [ 3049 llvm_i32_ty, // %a 3050 llvm_i32_ty, // %b 3051 llvm_i32_ty, // %c 3052 llvm_i1_ty // %clamp 3053 ], 3054 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3055 >; 3056 3057// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) 3058// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 3059def int_amdgcn_udot4 : 3060 ClangBuiltin<"__builtin_amdgcn_udot4">, 3061 DefaultAttrsIntrinsic< 3062 [llvm_i32_ty], // %r 3063 [ 3064 llvm_i32_ty, // %a 3065 llvm_i32_ty, // %b 3066 llvm_i32_ty, // %c 3067 llvm_i1_ty // %clamp 3068 ], 3069 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3070 >; 3071 3072// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) 3073// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 3074// a[i in 0. . . 3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); 3075// b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); 3076// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 3077def int_amdgcn_sudot4 : 3078 ClangBuiltin<"__builtin_amdgcn_sudot4">, 3079 DefaultAttrsIntrinsic< 3080 [llvm_i32_ty], // %r 3081 [ 3082 llvm_i1_ty, // %a_sign 3083 llvm_i32_ty, // %a 3084 llvm_i1_ty, // %b_sign 3085 llvm_i32_ty, // %b 3086 llvm_i32_ty, // %c 3087 llvm_i1_ty // %clamp 3088 ], 3089 [IntrNoMem, IntrSpeculatable, 3090 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 3091 >; 3092 3093// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) 3094// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 3095// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 3096def int_amdgcn_sdot8 : 3097 ClangBuiltin<"__builtin_amdgcn_sdot8">, 3098 DefaultAttrsIntrinsic< 3099 [llvm_i32_ty], // %r 3100 [ 3101 llvm_i32_ty, // %a 3102 llvm_i32_ty, // %b 3103 llvm_i32_ty, // %c 3104 llvm_i1_ty // %clamp 3105 ], 3106 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3107 >; 3108 3109// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) 3110// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 3111// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 3112def int_amdgcn_udot8 : 3113 ClangBuiltin<"__builtin_amdgcn_udot8">, 3114 DefaultAttrsIntrinsic< 3115 [llvm_i32_ty], // %r 3116 [ 3117 llvm_i32_ty, // %a 3118 llvm_i32_ty, // %b 3119 llvm_i32_ty, // %c 3120 llvm_i1_ty // %clamp 3121 ], 3122 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 3123 >; 3124 3125// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) 3126// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 3127// a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); 3128// b[i in 0. . . 7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); 3129// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 3130// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 3131 def int_amdgcn_sudot8 : 3132 ClangBuiltin<"__builtin_amdgcn_sudot8">, 3133 DefaultAttrsIntrinsic< 3134 [llvm_i32_ty], // %r 3135 [ 3136 llvm_i1_ty, // %a_sign 3137 llvm_i32_ty, // %a 3138 llvm_i1_ty, // %b_sign 3139 llvm_i32_ty, // %b 3140 llvm_i32_ty, // %c 3141 llvm_i1_ty // %clamp 3142 ], 3143 [IntrNoMem, IntrSpeculatable, 3144 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 3145 >; 3146 3147// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c) 3148// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 3149class AMDGPU8bitFloatDot4Intrinsic : 3150 ClangBuiltin<!subst("int", "__builtin", NAME)>, 3151 DefaultAttrsIntrinsic< 3152 [llvm_float_ty], // %r 3153 [ 3154 llvm_i32_ty, // %a 3155 llvm_i32_ty, // %b 3156 llvm_float_ty, // %c 3157 ], 3158 [IntrNoMem, IntrSpeculatable] 3159 >; 3160 3161def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 3162def int_amdgcn_dot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 3163def int_amdgcn_dot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 3164def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 3165 3166//===----------------------------------------------------------------------===// 3167// gfx908 intrinsics 3168// ===----------------------------------------------------------------------===// 3169 3170// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp 3171class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : 3172 ClangBuiltin<!subst("int", "__builtin", NAME)>, 3173 DefaultAttrsIntrinsic<[DestTy], 3174 [SrcABTy, SrcABTy, DestTy, 3175 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3176 [IntrConvergent, IntrNoMem, 3177 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 3178 3179 3180// srcA's format is determined by cbsz. srcB's format is determined by 3181// blgp. 3182// 3183// These should be <8 x i32> for f8 formats, <6 x i32> for f6 formats, 3184// and <4 x i32> for f4 formats. It is invalid to use a format that 3185// requires more registers than the corresponding vector type (e.g. it 3186// is illegal to use <6 x i32> in operand 0 if cbsz specifies an f8 3187// format that requires 8 registers). 3188class AMDGPUMfmaScaleIntrinsic<LLVMType DestTy> : 3189 DefaultAttrsIntrinsic<[DestTy], 3190 [llvm_anyvector_ty, llvm_anyvector_ty, DestTy, 3191 llvm_i32_ty, // cbsz 3192 llvm_i32_ty, // blgp 3193 // llvm_i1_ty, // TODO: neg_src2 3194 // llvm_i1_ty, // TODO: abs_src2 3195 // llvm_i1_ty, // TODO: clamp 3196 llvm_i32_ty, // op_sel (A matrix scale, 2-bits) // TODO: Make i2? 3197 llvm_i32_ty, // v_mfma_ld_scale_b32 src0 (A matrix scale) 3198 llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2? 3199 llvm_i32_ty // v_mfma_ld_scale_b32 src1 (B matrix scale) 3200 ], 3201 [IntrConvergent, IntrNoMem, 3202 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, 3203 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>> 3204 ]>; 3205 3206defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = { 3207def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>; 3208def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 3209def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 3210def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 3211def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 3212def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>; 3213def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 3214def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 3215def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 3216def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 3217def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>; 3218def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 3219def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 3220def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 3221def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 3222def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>; 3223def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 3224def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 3225def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 3226def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 3227} 3228 3229//===----------------------------------------------------------------------===// 3230// gfx90a intrinsics 3231// ===----------------------------------------------------------------------===// 3232 3233defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = { 3234def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>; 3235def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; 3236def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; 3237def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; 3238def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; 3239 3240// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. 3241// Three bits corresponding to the neg modifier applied to the respective 3242// source operand. 3243def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>; 3244def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>; 3245} 3246 3247//===----------------------------------------------------------------------===// 3248// gfx940 intrinsics 3249// ===----------------------------------------------------------------------===// 3250 3251class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> : 3252 AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>; 3253 3254multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> { 3255 foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in 3256 def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>; 3257} 3258 3259// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid 3260class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> : 3261 ClangBuiltin<!subst("int", "__builtin", NAME)>, 3262 DefaultAttrsIntrinsic<[DestTy], 3263 [SrcA, SrcB, DestTy, llvm_i32_ty, 3264 llvm_i32_ty, llvm_i32_ty], 3265 [IntrConvergent, IntrNoMem, 3266 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 3267 3268class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> : 3269 AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>; 3270 3271multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> { 3272 foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in 3273 def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>; 3274} 3275 3276defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = { 3277def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>; 3278def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>; 3279def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>; 3280def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>; 3281 3282defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>; 3283defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>; 3284 3285def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; 3286def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; 3287def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; 3288def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; 3289def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; 3290def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; 3291 3292defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>; 3293defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>; 3294} 3295 3296// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3] 3297// byte_sel selects byte from srcA. 3298def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">, 3299 DefaultAttrsIntrinsic<[llvm_float_ty], 3300 [llvm_i32_ty, llvm_i32_ty], 3301 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3302 3303// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3] 3304def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">, 3305 DefaultAttrsIntrinsic<[llvm_float_ty], 3306 [llvm_i32_ty, llvm_i32_ty], 3307 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3308 3309// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel 3310// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes. 3311def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">, 3312 DefaultAttrsIntrinsic<[llvm_v2f32_ty], 3313 [llvm_i32_ty, llvm_i1_ty], 3314 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3315 3316// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel. 3317def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">, 3318 DefaultAttrsIntrinsic<[llvm_v2f32_ty], 3319 [llvm_i32_ty, llvm_i1_ty], 3320 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3321 3322// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel 3323// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes. 3324def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">, 3325 DefaultAttrsIntrinsic<[llvm_i32_ty], 3326 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], 3327 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3328 3329// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel 3330def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">, 3331 DefaultAttrsIntrinsic<[llvm_i32_ty], 3332 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], 3333 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3334 3335// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] 3336// byte_sel selects byte to write into vdst. 3337def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">, 3338 DefaultAttrsIntrinsic<[llvm_i32_ty], 3339 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3340 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3341 3342// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] 3343def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, 3344 DefaultAttrsIntrinsic<[llvm_i32_ty], 3345 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3346 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3347 3348//===----------------------------------------------------------------------===// 3349// gfx950 intrinsics 3350//===----------------------------------------------------------------------===// 3351 3352defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = { 3353def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>; 3354def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>; 3355def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>; 3356def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>; 3357def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>; 3358def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>; 3359def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>; 3360def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>; 3361def int_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>; 3362def int_amdgcn_smfmac_f32_32x32x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>; 3363def int_amdgcn_smfmac_f32_16x16x64_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>; 3364def int_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>; 3365def int_amdgcn_smfmac_i32_16x16x128_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3366def int_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3367def int_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3368def int_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3369def int_amdgcn_smfmac_f32_16x16x128_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3370def int_amdgcn_smfmac_f32_16x16x128_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3371def int_amdgcn_smfmac_f32_32x32x64_bf8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3372def int_amdgcn_smfmac_f32_32x32x64_bf8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3373def int_amdgcn_smfmac_f32_32x32x64_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3374def int_amdgcn_smfmac_f32_32x32x64_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>; 3375} 3376 3377// { vdst_new, vsrc_new } llvm.amdgcn.permlane16.swap <vdst_old> <vsrc_old> <fi> <bound_control> 3378def int_amdgcn_permlane16_swap : 3379 Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, 3380 llvm_i1_ty, llvm_i1_ty], 3381 [IntrNoMem, IntrConvergent, IntrWillReturn, 3382 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>; 3383 3384// { vdst_new, vsrc_new } llvm.amdgcn.permlane32.swap <vdst_old> <vsrc_old> <fi> <bound_control> 3385def int_amdgcn_permlane32_swap : 3386 Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, 3387 llvm_i1_ty, llvm_i1_ty], 3388 [IntrNoMem, IntrConvergent, IntrWillReturn, 3389 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>; 3390 3391// llvm.amdgcn.ashr_pk_i8_i32 int vdst, int src0, int src1 int src2 3392def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">, 3393 DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3394 [IntrNoMem, IntrSpeculatable]>; 3395 3396// llvm.amdgcn.ashr_pk_u8_i32 int vdst, int src0, int src1 int src2 3397def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">, 3398 DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3399 [IntrNoMem, IntrSpeculatable]>; 3400 3401//===----------------------------------------------------------------------===// 3402// Special Intrinsics for backend internal use only. No frontend 3403// should emit calls to these. 3404// ===----------------------------------------------------------------------===// 3405// 3406// Control-flow intrinsics in LLVM IR are convergent because they represent the 3407// wave CFG, i.e., sets of threads that are "converged" or "execute in 3408// lock-step". But they exist during a small window in the lowering process, 3409// inserted after the structurizer and then translated to equivalent MIR 3410// pseudos. So rather than create convergence tokens for these builtins, we 3411// simply mark them as not convergent. 3412// 3413// This is really a workaround to allow control flow lowering in the presence of 3414// convergence control tokens. The corresponding MIR pseudos are marked as 3415// having side effects, which is sufficient to prevent optimizations without 3416// having to mark them as convergent. 3417def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], 3418 [llvm_i1_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3419>; 3420 3421def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], 3422 [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3423>; 3424 3425def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty], 3426 [llvm_i1_ty, LLVMMatchType<0>], 3427 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree] 3428>; 3429 3430def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], 3431 [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3432>; 3433 3434def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], 3435 [IntrWillReturn, IntrNoCallback, IntrNoFree]>; 3436 3437// Represent unreachable in a divergent region. 3438def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; 3439 3440// Emit 2.5 ulp, no denormal division. Should only be inserted by 3441// pass based on !fpmath metadata. 3442def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< 3443 [llvm_float_ty], [llvm_float_ty, llvm_float_ty], 3444 [IntrNoMem, IntrSpeculatable] 3445>; 3446 3447/// Emit an addrspacecast without null pointer checking. 3448/// Should only be inserted by a pass based on analysis of an addrspacecast's src. 3449def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic< 3450 [llvm_anyptr_ty], [llvm_anyptr_ty], 3451 [IntrNoMem, IntrSpeculatable] 3452>; 3453} 3454