1// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s 2 3llvm.func @rocdl_special_regs() -> i32 { 4 // CHECK-LABEL: rocdl_special_regs 5 // CHECK: call i32 @llvm.amdgcn.workitem.id.x() 6 %1 = rocdl.workitem.id.x : i32 7 // CHECK: call i32 @llvm.amdgcn.workitem.id.y() 8 %2 = rocdl.workitem.id.y : i32 9 // CHECK: call i32 @llvm.amdgcn.workitem.id.z() 10 %3 = rocdl.workitem.id.z : i32 11 // CHECK: call i32 @llvm.amdgcn.workgroup.id.x() 12 %4 = rocdl.workgroup.id.x : i32 13 // CHECK: call i32 @llvm.amdgcn.workgroup.id.y() 14 %5 = rocdl.workgroup.id.y : i32 15 // CHECK: call i32 @llvm.amdgcn.workgroup.id.z() 16 %6 = rocdl.workgroup.id.z : i32 17 // CHECK: call i64 @__ockl_get_local_size(i32 0) 18 %7 = rocdl.workgroup.dim.x : i64 19 // CHECK: call i64 @__ockl_get_local_size(i32 1) 20 %8 = rocdl.workgroup.dim.y : i64 21 // CHECK: call i64 @__ockl_get_local_size(i32 2) 22 %9 = rocdl.workgroup.dim.z : i64 23 // CHECK: call i64 @__ockl_get_num_groups(i32 0) 24 %10 = rocdl.grid.dim.x : i64 25 // CHECK: call i64 @__ockl_get_num_groups(i32 1) 26 %11 = rocdl.grid.dim.y : i64 27 // CHECK: call i64 @__ockl_get_num_groups(i32 2) 28 %12 = rocdl.grid.dim.z : i64 29 30 // CHECK: call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x() 31 %13 = rocdl.workitem.id.x range <i32, 0, 64> : i32 32 33 // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0) 34 %14 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64 35 llvm.return %1 : i32 36} 37 38llvm.func @kernel_func() attributes {rocdl.kernel} { 39 // CHECK-LABEL: amdgpu_kernel void @kernel_func() 40 // CHECK: #[[$KERNEL_ATTRS:[0-9]+]] 41 llvm.return 42} 43 44llvm.func @kernel_func_workgroups() 45 attributes {rocdl.kernel, rocdl.max_flat_work_group_size = 1024 : index} { 46 // CHECK-LABEL: amdgpu_kernel void @kernel_func_workgroups() 47 // CHECK: #[[$KERNEL_WORKGROUP_ATTRS:[0-9]+]] 48 llvm.return 49} 50 51llvm.func @known_block_sizes() 52 attributes {rocdl.kernel, 53 rocdl.flat_work_group_size = "128,128", 54 
rocdl.reqd_work_group_size = array<i32: 16, 4, 2>} {
  // CHECK-LABEL: amdgpu_kernel void @known_block_sizes()
  // CHECK: #[[$KNOWN_BLOCK_SIZE_ATTRS:[0-9]+]]
  // CHECK: !reqd_work_group_size ![[$REQD_WORK_GROUP_SIZE:[0-9]+]]
  llvm.return
}

llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
  // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
  // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
  llvm.return
}

llvm.func @kernel_func_waves_per_eu()
    attributes {rocdl.kernel, rocdl.waves_per_eu = 2 : i32} {
  // CHECK-LABEL: amdgpu_kernel void @kernel_func_waves_per_eu()
  // CHECK: #[[$KERNEL_WAVES_PER_EU_ATTR:[0-9]+]]
  llvm.return
}

llvm.func @kernel_func_unsafe_fp_atomics()
    attributes {rocdl.kernel, rocdl.unsafe_fp_atomics = true} {
  // CHECK-LABEL: amdgpu_kernel void @kernel_func_unsafe_fp_atomics()
  // CHECK: #[[$KERNEL_UNSAFE_FP_ATOMICS_ATTR:[0-9]+]]
  llvm.return
}

// Lane-id computation: mbcnt.lo feeds its result into mbcnt.hi.
llvm.func @rocdl.lane_id() -> i32 {
  // Anchor the following checks to this function, like the sibling tests do,
  // so the mbcnt patterns cannot match calls emitted for another function.
  // CHECK-LABEL: rocdl.lane_id
  // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
  %0 = llvm.mlir.constant(-1 : i32) : i32
  %1 = llvm.mlir.constant(0 : i32) : i32
  %2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32
  %3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32
  llvm.return %3 : i32
}

llvm.func @rocdl.swizzle(%src : i32) -> i32 {
  // CHECK-LABEL: rocdl.swizzle
  // CHECK: call i32 @llvm.amdgcn.ds.swizzle
  %offset = llvm.mlir.constant(100 : i32) : i32
  %0 = rocdl.ds_swizzle %src, %offset : (i32, i32) -> i32
  llvm.return %0 : i32
}

llvm.func @rocdl.bpermute(%src : i32) -> i32 {
  // CHECK-LABEL: rocdl.bpermute
  // CHECK: call i32 @llvm.amdgcn.ds.bpermute
  %index = llvm.mlir.constant(10 : i32) : i32
  %0 = rocdl.ds_bpermute %index, %src : (i32, i32) -> i32
  llvm.return %0 : i32
}

llvm.func
@rocdl.ballot32(%pred : i1) -> i32 { 108 // CHECK-LABEL: rocdl.ballot32 109 // CHECK: call i32 @llvm.amdgcn.ballot 110 %0 = rocdl.ballot %pred : i32 111 llvm.return %0 : i32 112} 113 114llvm.func @rocdl.ballot64(%pred : i1) -> i64 { 115 // CHECK-LABEL: rocdl.ballot64 116 // CHECK: call i64 @llvm.amdgcn.ballot 117 %0 = rocdl.ballot %pred : i64 118 llvm.return %0 : i64 119} 120 121llvm.func @rocdl.readlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 { 122 %idx = llvm.mlir.constant(0 : i32) : i32 123 124 // CHECK-LABEL: rocdl.readlane 125 // CHECK: call float @llvm.amdgcn.readlane.f32(float %{{.*}}, i32 0) 126 %0 = rocdl.readlane %src0, %idx : (f32, i32) -> f32 127 128 // CHECK: call double @llvm.amdgcn.readlane.f64(double %{{.*}}, i32 0) 129 %1 = rocdl.readlane %src1, %idx : (f64, i32) -> f64 130 131 // CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %{{.*}}, i32 0) 132 %2 = rocdl.readlane %src2, %idx : (i32, i32) -> i32 133 134 // CHECK: call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %{{.*}}, i32 0) 135 %3 = rocdl.readlane %src3, %idx : (vector<2 x f32>, i32) -> vector<2 x f32> 136 137 llvm.return %0 : f32 138} 139 140llvm.func @rocdl.waitcnt() { 141 // CHECK-LABEL: rocdl.waitcnt 142 // CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0) 143 rocdl.waitcnt 0 144 llvm.return 145} 146 147llvm.func @rocdl.s.barrier() { 148 // CHECK-LABEL: rocdl.s.barrier 149 // CHECK-NEXT: call void @llvm.amdgcn.s.barrier() 150 rocdl.s.barrier 151 llvm.return 152} 153 154 155llvm.func @rocdl.barrier() { 156 // CHECK-LABEL: rocdl.barrier 157 // CHECK: fence syncscope("workgroup") release 158 // CHECK-NEXT: call void @llvm.amdgcn.s.barrier() 159 // CHECK-NEXT: fence syncscope("workgroup") acquire 160 rocdl.barrier 161 llvm.return 162} 163 164llvm.func @rocdl.s.barrier.signal() { 165 // CHECK-LABEL: rocdl.s.barrier.signal 166 // CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) 167 rocdl.s.barrier.signal -1 168 llvm.return 169} 170 171llvm.func 
@rocdl.s.barrier.wait() {
  // CHECK-LABEL: rocdl.s.barrier.wait
  // CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
  rocdl.s.barrier.wait -1
  llvm.return
}

llvm.func @rocdl.s.wait.dscnt() {
  // CHECK-LABEL: rocdl.s.wait.dscnt
  // CHECK-NEXT: call void @llvm.amdgcn.s.wait.dscnt(i16 0)
  rocdl.s.wait.dscnt 0
  llvm.return
}

llvm.func @rocdl.setprio() {
  // Anchor the checks to this function, matching the neighbouring tests.
  // CHECK-LABEL: rocdl.setprio
  // CHECK: call void @llvm.amdgcn.s.setprio(i16 0)
  rocdl.s.setprio 0
  // CHECK-NEXT: call void @llvm.amdgcn.s.setprio(i16 1)
  rocdl.s.setprio 1
  llvm.return
}

llvm.func @rocdl.schedbarrier() {
  // Anchor the checks to this function, matching the neighbouring tests.
  // CHECK-LABEL: rocdl.schedbarrier
  // CHECK: call void @llvm.amdgcn.sched.barrier(i32 0)
  rocdl.sched.barrier 0
  // CHECK-NEXT: call void @llvm.amdgcn.sched.barrier(i32 1)
  rocdl.sched.barrier 1
  llvm.return
}

llvm.func @rocdl.sched.group.barrier() {
  // CHECK-LABEL: rocdl.sched.group.barrier
  // CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  rocdl.sched.group.barrier 8, 1, 0
  llvm.return
}

llvm.func @rocdl.iglp.opt() {
  // CHECK-LABEL: rocdl.iglp.opt
  // CHECK-NEXT: call void @llvm.amdgcn.iglp.opt(i32 0)
  rocdl.iglp.opt 0
  // CHECK-NEXT: call void @llvm.amdgcn.iglp.opt(i32 1)
  rocdl.iglp.opt 1
  llvm.return
}

llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32,
                   %arg2 : vector<32 x f32>, %arg3: i32,
                   %arg4 : vector<16 x f32>, %arg5 : vector<4xf32>,
                   %arg6 : vector<4xf16>, %arg7 : vector<32 x i32>,
                   %arg8 : vector<16 x i32>, %arg9 : vector<4xi32>,
                   %arg10 : vector<2xi16>, %arg11 : i64,
                   %arg12 : vector<8xbf16>, %arg13 : vector<4xi32>,
                   %arg14 : vector<8xf16>) -> vector<32 x f32> {
  %csti32 = llvm.mlir.constant(42 : i32) : i32

  // CHECK-LABEL: rocdl.xdlops
  // CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %{{.*}}, float %{{.*}}, <32 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r0 = rocdl.mfma.f32.32x32x1f32 %arg0, %arg1,
%arg2, %csti32, %csti32, %csti32 : 230 (f32, f32, vector<32 x f32>, 231 i32, i32, i32) -> vector<32 x f32> 232 233 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %{{.*}}, float %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 234 %r1 = rocdl.mfma.f32.16x16x1f32 %arg0, %arg1, %arg4, %csti32, %csti32, %csti32 : 235 (f32, f32, vector<16 x f32>, 236 i32, i32, i32) -> vector<16 x f32> 237 238 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %{{.*}}, float %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 239 %r2 = rocdl.mfma.f32.16x16x4f32 %arg0, %arg1, %arg5, %csti32, %csti32, %csti32 : 240 (f32, f32, vector<4xf32>, 241 i32, i32, i32) -> vector<4xf32> 242 243 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %{{.*}}, float %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 244 %r3 = rocdl.mfma.f32.4x4x1f32 %arg0, %arg1, %arg5, %csti32, %csti32, %csti32 : 245 (f32, f32, vector<4xf32>, 246 i32, i32, i32) -> vector<4xf32> 247 248 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %{{.*}}, float %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 249 %r4= rocdl.mfma.f32.32x32x2f32 %arg0, %arg1, %arg4, %csti32, %csti32, %csti32 : 250 (f32, f32, vector<16 x f32>, 251 i32, i32, i32) -> vector<16 x f32> 252 253 // CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <32 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 254 %r5 = rocdl.mfma.f32.32x32x4f16 %arg6, %arg6, %arg2, %csti32, %csti32, %csti32 : 255 (vector<4xf16>, vector<4xf16>, vector<32 x f32>, 256 i32, i32, i32) -> vector<32 x f32> 257 258 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 259 %r6 = rocdl.mfma.f32.16x16x4f16 %arg6, %arg6, %arg4, %csti32, %csti32, %csti32 : 260 (vector<4xf16>, vector<4xf16>, vector<16 x f32>, 261 i32, 
i32, i32) -> vector<16 x f32> 262 263 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 264 %r7 = rocdl.mfma.f32.4x4x4f16 %arg6, %arg6, %arg5, %csti32, %csti32, %csti32 : 265 (vector<4xf16>, vector<4xf16>, vector<4xf32>, 266 i32, i32, i32) -> vector<4xf32> 267 268 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 269 %r8 = rocdl.mfma.f32.32x32x8f16 %arg6, %arg6, %arg4, %csti32, %csti32, %csti32 : 270 (vector<4xf16>, vector<4xf16>, vector<16 x f32>, 271 i32, i32, i32) -> vector<16 x f32> 272 273 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 274 %r9 = rocdl.mfma.f32.16x16x16f16 %arg6, %arg6, %arg5, %csti32, %csti32, %csti32 : 275 (vector<4xf16>, vector<4xf16>, vector<4xf32>, 276 i32, i32, i32) -> vector<4xf32> 277 278 // CHECK: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %{{.*}}, i32 %{{.*}}, <32 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 279 %r10 = rocdl.mfma.i32.32x32x4i8 %arg3, %arg3, %arg7, %csti32, %csti32, %csti32 : 280 (i32, i32, vector<32 x i32>, 281 i32, i32, i32) -> vector<32 x i32> 282 283 // CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %{{.*}}, i32 %{{.*}}, <16 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 284 %r11 = rocdl.mfma.i32.16x16x4i8 %arg3, %arg3, %arg8, %csti32, %csti32, %csti32 : 285 (i32, i32, vector<16 x i32>, 286 i32, i32, i32) -> vector<16 x i32> 287 288 // CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 289 %r12 = rocdl.mfma.i32.4x4x4i8 %arg3, %arg3, %arg9, %csti32, %csti32, %csti32 : 290 (i32, i32, vector<4xi32>, 291 i32, i32, i32) -> vector<4xi32> 292 293 // CHECK: call <16 x i32> 
@llvm.amdgcn.mfma.i32.32x32x8i8(i32 %{{.*}}, i32 %{{.*}}, <16 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 294 %r13 = rocdl.mfma.i32.32x32x8i8 %arg3, %arg3, %arg8, %csti32, %csti32, %csti32 : 295 (i32, i32, vector<16 x i32>, 296 i32, i32, i32) -> vector<16 x i32> 297 298 // CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 299 %r14 = rocdl.mfma.i32.16x16x16i8 %arg3, %arg3, %arg9, %csti32, %csti32, %csti32 : 300 (i32, i32, vector<4xi32>, 301 i32, i32, i32) -> vector<4xi32> 302 303 // CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <32 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 304 %r15 = rocdl.mfma.f32.32x32x2bf16 %arg10, %arg10, %arg2, %csti32, %csti32, %csti32 : 305 (vector<2xi16>, vector<2xi16>, vector<32 x f32>, 306 i32, i32, i32) -> vector<32 x f32> 307 308 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 309 %r16 = rocdl.mfma.f32.16x16x2bf16 %arg10, %arg10, %arg4, %csti32, %csti32, %csti32 : 310 (vector<2xi16>, vector<2xi16>, vector<16 x f32>, 311 i32, i32, i32) -> vector<16 x f32> 312 313 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 314 %r17 = rocdl.mfma.f32.4x4x2bf16 %arg10, %arg10, %arg5, %csti32, %csti32, %csti32 : 315 (vector<2xi16>, vector<2xi16>, vector<4xf32>, 316 i32, i32, i32) -> vector<4xf32> 317 318 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 319 %r18 = rocdl.mfma.f32.32x32x4bf16 %arg10, %arg10, %arg4, %csti32, %csti32, %csti32 : 320 (vector<2xi16>, vector<2xi16>, vector<16 x f32>, 321 i32, i32, i32) -> vector<16 x f32> 322 323 // CHECK: call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 324 %r19 = rocdl.mfma.f32.16x16x8bf16 %arg10, %arg10, %arg5, %csti32, %csti32, %csti32 : 325 (vector<2xi16>, vector<2xi16>, vector<4xf32>, 326 i32, i32, i32) -> vector<4xf32> 327 328 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %{{.*}}, i64 %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 329 %r20 = rocdl.mfma.f32.16x16x32.bf8.bf8 %arg11, %arg11, %arg5, %csti32, %csti32, %csti32 : 330 (i64, i64, vector<4xf32>, 331 i32, i32, i32) -> vector<4xf32> 332 333 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %{{.*}}, i64 %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 334 %r21 = rocdl.mfma.f32.16x16x32.bf8.fp8 %arg11, %arg11, %arg5, %csti32, %csti32, %csti32 : 335 (i64, i64, vector<4xf32>, 336 i32, i32, i32) -> vector<4xf32> 337 338 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %{{.*}}, i64 %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 339 %r22 = rocdl.mfma.f32.16x16x32.fp8.bf8 %arg11, %arg11, %arg5, %csti32, %csti32, %csti32 : 340 (i64, i64, vector<4xf32>, 341 i32, i32, i32) -> vector<4xf32> 342 343 // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %{{.*}}, i64 %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 344 %r23 = rocdl.mfma.f32.16x16x32.fp8.fp8 %arg11, %arg11, %arg5, %csti32, %csti32, %csti32 : 345 (i64, i64, vector<4xf32>, 346 i32, i32, i32) -> vector<4xf32> 347 348 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %{{.*}}, i64 %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 349 %r24 = rocdl.mfma.f32.32x32x16.bf8.bf8 %arg11, %arg11, %arg4, %csti32, %csti32, %csti32 : 350 (i64, i64, vector<16xf32>, 351 i32, i32, i32) -> vector<16xf32> 352 353 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %{{.*}}, i64 %{{.*}}, <16 x 
float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r25 = rocdl.mfma.f32.32x32x16.bf8.fp8 %arg11, %arg11, %arg4, %csti32, %csti32, %csti32 :
                            (i64, i64, vector<16xf32>,
                            i32, i32, i32) -> vector<16xf32>

  // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %{{.*}}, i64 %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r26 = rocdl.mfma.f32.32x32x16.fp8.bf8 %arg11, %arg11, %arg4, %csti32, %csti32, %csti32 :
                            (i64, i64, vector<16xf32>,
                            i32, i32, i32) -> vector<16xf32>

  // fp8 x fp8 pairing for 32x32x16; follows the bf8.fp8 and fp8.bf8 cases
  // above so that every operand-format combination is translated once.
  // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %{{.*}}, i64 %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r27 = rocdl.mfma.f32.32x32x16.fp8.fp8 %arg11, %arg11, %arg4, %csti32, %csti32, %csti32 :
                            (i64, i64, vector<16xf32>,
                            i32, i32, i32) -> vector<16xf32>

  // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r28 = rocdl.mfma.f32.16x16x32.bf16 %arg12, %arg12, %arg5, %csti32, %csti32, %csti32 :
                            (vector<8xbf16>, vector<8xbf16>, vector<4xf32>,
                            i32, i32, i32) -> vector<4xf32>

  // CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r29 = rocdl.mfma.i32.16x16x64.i8 %arg9, %arg9, %arg9, %csti32, %csti32, %csti32 :
                            (vector<4xi32>, vector<4xi32>, vector<4xi32>,
                            i32, i32, i32) -> vector<4xi32>

  // The result type mirrors the vector<4xf32> accumulator operand and the
  // <4 x float> return type matched in the pattern below.
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <4 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
  %r30 = rocdl.mfma.f32.16x16x32.f16 %arg14, %arg14, %arg5, %csti32, %csti32, %csti32 :
                            (vector<8xf16>, vector<8xf16>, vector<4xf32>,
                            i32, i32, i32) -> vector<4xf32>

  // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, 
<16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 384 %r31 = rocdl.mfma.f32.32x32x16.bf16 %arg12, %arg12, %arg4, %csti32, %csti32, %csti32 : 385 (vector<8xbf16>, vector<8xbf16>, vector<16xf32>, 386 i32, i32, i32) -> vector<16xf32> 387 388 // CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 389 %r32 = rocdl.mfma.i32.32x32x32.i8 %arg9, %arg9, %arg8, %csti32, %csti32, %csti32 : 390 (vector<4xi32>, vector<4xi32>, vector<16xi32>, 391 i32, i32, i32) -> vector<16xi32> 392 393 // CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <16 x float> %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}) 394 %r33 = rocdl.mfma.f32.32x32x16.f16 %arg14, %arg14, %arg4, %csti32, %csti32, %csti32 : 395 (vector<8xf16>, vector<8xf16>, vector<16xf32>, 396 i32, i32, i32) -> vector<16xf32> 397 398 llvm.return %r0 : vector<32 x f32> 399} 400 401llvm.func @rocdl.smfmac(%arg0 : i32, 402 %arg1 : vector<4 x f16>, 403 %arg2 : vector<8 x f16>, 404 %arg3 : vector<4 x f32>, 405 %arg4 : vector<16 x f32>, 406 %arg5 : vector<4 x i16>, 407 %arg6 : vector<8 x i16>, 408 %arg7 : vector<2xi32>, 409 %arg8 : vector<4xi32>, 410 %arg9 : vector<16xi32>) -> vector<4 x f32> { 411 %csti32 = llvm.mlir.constant(42 : i32) : i32 412 413 // CHECK-LABEL: rocdl.smfmac 414 415 // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %{{.*}}, <8 x half> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) 416 %r0 = rocdl.smfmac.f32.16x16x32.f16 %arg1, %arg2, %arg3, %csti32, %csti32, %csti32 : 417 (vector<4xf16>, vector<8xf16>, vector<4xf32>, 418 i32, i32, i32) -> vector<4xf32> 419 420 // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %{{.*}}, <8 x half> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) 421 %r1 = rocdl.smfmac.f32.32x32x16.f16 %arg1, %arg2, %arg4, %csti32, %csti32, %csti32 : 422 (vector<4xf16>, vector<8xf16>, 
vector<16xf32>, 423 i32, i32, i32) -> vector<16xf32> 424 425 // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) 426 %r2 = rocdl.smfmac.f32.16x16x32.bf16 %arg5, %arg6, %arg3, %csti32, %csti32, %csti32 : 427 (vector<4xi16>, vector<8xi16>, vector<4xf32>, 428 i32, i32, i32) -> vector<4xf32> 429 430 // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) 431 %r3 = rocdl.smfmac.f32.32x32x16.bf16 %arg5, %arg6, %arg4, %csti32, %csti32, %csti32 : 432 (vector<4xi16>, vector<8xi16>, vector<16xf32>, 433 i32, i32, i32) -> vector<16xf32> 434 435 // CHECK: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 42, i32 42, i32 42) 436 %r4 = rocdl.smfmac.i32.16x16x64.i8 %arg7, %arg8, %arg8, %csti32, %csti32, %csti32 : 437 (vector<2xi32>, vector<4xi32>, vector<4xi32>, 438 i32, i32, i32) -> vector<4xi32> 439 440 // CHECK: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> %{{.*}}, i32 42, i32 42, i32 42) 441 %r5 = rocdl.smfmac.i32.32x32x32.i8 %arg7, %arg8, %arg9, %csti32, %csti32, %csti32 : 442 (vector<2xi32>, vector<4xi32>, vector<16xi32>, 443 i32, i32, i32) -> vector<16xi32> 444 445 // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) 446 %r6 = rocdl.smfmac.f32.16x16x64.bf8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : 447 (vector<2xi32>, vector<4xi32>, vector<4xf32>, 448 i32, i32, i32) -> vector<4xf32> 449 450 // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) 451 %r7 = rocdl.smfmac.f32.16x16x64.bf8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : 452 (vector<2xi32>, vector<4xi32>, 
vector<4xf32>, 453 i32, i32, i32) -> vector<4xf32> 454 455 // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) 456 %r8 = rocdl.smfmac.f32.16x16x64.fp8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : 457 (vector<2xi32>, vector<4xi32>, vector<4xf32>, 458 i32, i32, i32) -> vector<4xf32> 459 460 // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) 461 %r9 = rocdl.smfmac.f32.16x16x64.fp8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : 462 (vector<2xi32>, vector<4xi32>, vector<4xf32>, 463 i32, i32, i32) -> vector<4xf32> 464 465 // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) 466 %r10 = rocdl.smfmac.f32.32x32x32.bf8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : 467 (vector<2xi32>, vector<4xi32>, vector<16xf32>, 468 i32, i32, i32) -> vector<16xf32> 469 470 // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) 471 %r11 = rocdl.smfmac.f32.32x32x32.bf8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : 472 (vector<2xi32>, vector<4xi32>, vector<16xf32>, 473 i32, i32, i32) -> vector<16xf32> 474 475 // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) 476 %r12 = rocdl.smfmac.f32.32x32x32.fp8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : 477 (vector<2xi32>, vector<4xi32>, vector<16xf32>, 478 i32, i32, i32) -> vector<16xf32> 479 480 481 // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) 482 %r13 = rocdl.smfmac.f32.32x32x32.fp8.fp8 %arg7, %arg8, %arg4, %csti32, 
%csti32, %csti32 : 483 (vector<2xi32>, vector<4xi32>, vector<16xf32>, 484 i32, i32, i32) -> vector<16xf32> 485 486 llvm.return %r0 : vector<4 x f32> 487} 488 489 490llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, 491 %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, 492 %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) -> vector<16 x f32> { 493 %cst0 = llvm.mlir.constant(0 : i32) : i32 494 %cst1 = llvm.mlir.constant(1 : i32) : i32 495 %cst2 = llvm.mlir.constant(2 : i32) : i32 496 %cst3 = llvm.mlir.constant(3 : i32) : i32 497 %cst4 = llvm.mlir.constant(4 : i32) : i32 498 499 // CHECK-LABEL: rocdl.mfma.scale.f32.32x32x64.f8f6f4 500 // fp8 * fp8 501 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 502 %r00 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst0, %cst0, %arg0, %cst0, %arg0 : 503 (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 504 505 // fp8 * bf8 506 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 507 %r01 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst1, %cst0, %arg0, %cst0, %arg0 : 508 (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 509 510 // fp8 * fp6 511 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 512 %r02 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst2, %cst0, %arg0, %cst0, %arg0 : 513 (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 514 515 // fp8 * bf6 516 // CHECK: call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 517 %r03 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst3, %cst0, %arg0, %cst0, %arg0 : 518 (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 519 520 // fp8 * fp4 521 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 0, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 522 %r04 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg4, %arg1, %cst0, %cst4, %cst0, %arg0, %cst0, %arg0 : 523 (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 524 525 // bf8 * fp8 526 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 527 %r10 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst0, %cst0, %arg0, %cst0, %arg0 : 528 (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 529 530 // bf8 * bf8 531 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 532 %r11 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst1, %cst0, %arg0, %cst0, %arg0 : 533 (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 534 535 // bf8 * fp6 536 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 537 %r12 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst2, %cst0, 
%arg0, %cst0, %arg0 : 538 (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 539 540 // bf8 * bf6 541 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 542 %r13 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst3, %cst0, %arg0, %cst0, %arg0 : 543 (vector<8xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 544 545 // bf8 * fp4 546 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 1, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 547 %r14 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg2, %arg4, %arg1, %cst1, %cst4, %cst0, %arg0, %cst0, %arg0 : 548 (vector<8xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 549 550 // fp6 * fp8 551 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 552 %r20 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst0, %cst0, %arg0, %cst0, %arg0 : 553 (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 554 555 // fp6 * bf8 556 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 557 %r21 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst1, %cst0, %arg0, %cst0, %arg0 : 558 (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 559 560 // fp6 * fp6 561 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> 
%{{.*}}, <16 x float> %{{.*}}, i32 2, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 562 %r22 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst2, %cst0, %arg0, %cst0, %arg0 : 563 (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 564 565 // fp6 * bf6 566 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 567 %r23 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst3, %cst0, %arg0, %cst0, %arg0 : 568 (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 569 570 // fp6 * fp4 571 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 2, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 572 %r24 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg4, %arg1, %cst2, %cst4, %cst0, %arg0, %cst0, %arg0 : 573 (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 574 575 // bf6 * fp8 576 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 577 %r30 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst0, %cst0, %arg0, %cst0, %arg0 : 578 (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 579 580 // bf6 * bf8 581 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 582 %r31 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst1, %cst0, %arg0, %cst0, %arg0 : 583 (vector<6xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, 
i32, i32, i32) -> vector<16xf32> 584 585 // bf6 * fp6 586 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 587 %r32 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst2, %cst0, %arg0, %cst0, %arg0 : 588 (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 589 590 // bf6 * bf6 591 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 592 %r33 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst3, %cst0, %arg0, %cst0, %arg0 : 593 (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 594 595 // bf6 * fp4 596 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 3, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 597 %r34 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg3, %arg4, %arg1, %cst3, %cst4, %cst0, %arg0, %cst0, %arg0 : 598 (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 599 600 // fp4 * fp8 601 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 602 %r40 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst0, %cst0, %arg0, %cst0, %arg0 : 603 (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> 604 605 // fp4 * bf8 606 // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}}) 607 
%r41 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst1, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>

  // fp4 * fp6
  // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r42 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst2, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>

  // fp4 * bf6
  // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r43 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst3, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>

  // fp4 * fp4
  // CHECK: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 4, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r44 = rocdl.mfma.scale.f32.32x32x64.f8f6f4 %arg4, %arg4, %arg1, %cst4, %cst4, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>

  llvm.return %r00 : vector<16 x f32>
}

// Translation test for rocdl.mfma.scale.f32.16x16x128.f8f6f4 covering every
// pairing of the five source formats named in the case comments. The fourth
// and fifth operands are the constants that select the formats
// (0 = fp8, 1 = bf8, 2 = fp6, 3 = bf6, 4 = fp4). fp8/bf8 inputs are passed as
// vector<8xi32>, fp6/bf6 as vector<6xi32> and fp4 as vector<4xi32>; the
// expected intrinsic name carries both input vector types as suffixes.
llvm.func @rocdl.mfma.scale.f32.16x16x128.f8f6f4(%arg0 : i32,
    %arg1 : vector<4 x f32>, %arg2 : vector<8xi32>,
    %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) -> vector<4 x f32> {
  // Format-selector constants shared by all cases below.
  %cst0 = llvm.mlir.constant(0 : i32) : i32
  %cst1 = llvm.mlir.constant(1 : i32) : i32
  %cst2 = llvm.mlir.constant(2 : i32) : i32
  %cst3 = llvm.mlir.constant(3 : i32) : i32
  %cst4 = llvm.mlir.constant(4 : i32) : i32

  // CHECK-LABEL: rocdl.mfma.scale.f32.16x16x128.f8f6f4
  // fp8 * fp8 (codes 0, 0)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r00 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst0, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp8 * bf8 (codes 0, 1)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r01 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst0, %cst1, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp8 * fp6 (codes 0, 2)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r02 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst2, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp8 * bf6 (codes 0, 3)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r03 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst0, %cst3, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp8 * fp4 (codes 0, 4)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 0, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r04 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg4, %arg1, %cst0, %cst4, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf8 * fp8 (codes 1, 0)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r10 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst0, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf8 * bf8 (codes 1, 1)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r11 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg2, %arg1, %cst1, %cst1, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf8 * fp6 (codes 1, 2)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r12 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst2, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf8 * bf6 (codes 1, 3)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r13 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg3, %arg1, %cst1, %cst3, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf8 * fp4 (codes 1, 4)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 1, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r14 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg2, %arg4, %arg1, %cst1, %cst4, %cst0, %arg0, %cst0, %arg0 :
      (vector<8xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp6 * fp8 (codes 2, 0)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r20 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst0, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp6 * bf8 (codes 2, 1)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r21 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst2, %cst1, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp6 * fp6 (codes 2, 2)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r22 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst2, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp6 * bf6 (codes 2, 3)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r23 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst2, %cst3, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp6 * fp4 (codes 2, 4)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 2, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r24 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg4, %arg1, %cst2, %cst4, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf6 * fp8 (codes 3, 0)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r30 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst0, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf6 * bf8 (codes 3, 1)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r31 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg2, %arg1, %cst3, %cst1, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf6 * fp6 (codes 3, 2)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r32 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst2, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf6 * bf6 (codes 3, 3)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r33 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg3, %arg1, %cst3, %cst3, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // bf6 * fp4 (codes 3, 4)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 3, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r34 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg3, %arg4, %arg1, %cst3, %cst4, %cst0, %arg0, %cst0, %arg0 :
      (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp4 * fp8 (codes 4, 0)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 0, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r40 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst0, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp4 * bf8 (codes 4, 1)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 1, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r41 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg2, %arg1, %cst4, %cst1, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp4 * fp6 (codes 4, 2)
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 2, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r42 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst2, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) ->
vector<4xf32>

  // fp4 * bf6
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %{{.*}}, <6 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 3, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r43 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg3, %arg1, %cst4, %cst3, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  // fp4 * fp4
  // The closing parenthesis below was missing; every sibling case spells out
  // the full call, so this one now does too.
  // CHECK: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 4, i32 4, i32 0, i32 %{{.*}}, i32 0, i32 %{{.*}})
  %r44 = rocdl.mfma.scale.f32.16x16x128.f8f6f4 %arg4, %arg4, %arg1, %cst4, %cst4, %cst0, %arg0, %cst0, %arg0 :
      (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>

  llvm.return %r00 : vector<4 x f32>
}

// Translation test for the rocdl.wmma.* ops. Each op is exercised twice: once
// with the wider accumulator/result vectors (the "Wave32" section) and once
// with the narrower ones (the "Wave64" section); the expected intrinsic name
// is suffixed with the result and input vector types.
llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : vector<16 x i16>, %arg3 : vector<8 x i32>,
    %arg4 : vector<2xi32>, %arg5 : vector<4xi32>, %arg6 : vector<4xf32>, %arg7 : vector<8xf16>, %arg8 : vector<8xi16>) -> vector<8xf32> {
  // Anchor the checks below to this function so they cannot be satisfied by
  // output belonging to a neighbouring function.
  // CHECK-LABEL: @rocdl.wmma(
  %zero = llvm.mlir.constant(false) : i1

  // ---- Wave32 -----

  // f16 -> f32
  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
  %r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>

  // bf16 -> f32
  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
  %r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>

  // f16 -> f16 (OPSEL = {0,1})
  // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
  %r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>

  // bf16 -> bf16 (OPSEL = {0,1})
  // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
  %r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16>

  // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
  // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
  %r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>

  // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
  // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
  %r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>

  // ---- Wave64 -----

  // f16 -> f32
  // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
  %r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>

  // bf16 -> f32
  // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
  %r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>

  // f16 -> f16 (OPSEL = {0,1})
  // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
  %r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16>

  // bf16 -> bf16 (OPSEL = {0,1})
  // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
  %r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16>

  // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
  // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
  %r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32>

  // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
  // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
  %r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32>

  llvm.return %r0 : vector<8xf32>
}

// Translation test for the rocdl.ds.read.tr* ops on an address-space-3
// pointer; the expected intrinsic is suffixed with the result vector type.
llvm.func @rocdl.ds.read.tr(%ptr : !llvm.ptr<3>) -> vector<4xf16> {
  // CHECK-LABEL: rocdl.ds.read.tr
  // CHECK: call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %0)
  %r0 = rocdl.ds.read.tr4.b64 %ptr : !llvm.ptr<3> -> vector<2xi32>
  // CHECK: call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %0)
  %r1 = rocdl.ds.read.tr6.b96 %ptr : !llvm.ptr<3> -> vector<3xi32>
  // CHECK: call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %0)
  %r2 = rocdl.ds.read.tr8.b64 %ptr : !llvm.ptr<3> -> vector<2xi32>
  // CHECK: call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %0)
  %r3 = rocdl.ds.read.tr16.b64 %ptr : !llvm.ptr<3> -> vector<4xf16>
  // CHECK: call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %0)
  %r4 = rocdl.ds.read.tr16.b64 %ptr : !llvm.ptr<3> -> vector<4xbf16>
  llvm.return %r3 : vector<4xf16>
}

// Translation test for rocdl.global.load.lds with constant size, offset and
// aux operands.
llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
  // CHECK-LABEL: @rocdl.global.load.lds(
  %aux = llvm.mlir.constant(0 : i32) : i32
  %offset = llvm.mlir.constant(0 : i32) : i32
  %size = llvm.mlir.constant(10 : i32) : i32
  // CHECK: call void @llvm.amdgcn.global.load.lds
  rocdl.global.load.lds %src, %dst, %size, %offset, %aux
  llvm.return
}

// Translation test for rocdl.make.buffer.rsrc: the call result must be the
// value that is returned.
llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
    %stride : i16,
    %numRecords : i32,
    %flags : i32) -> !llvm.ptr<8> {
  // CHECK-LABEL: rocdl.make.buffer.rsrc
  // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  // CHECK: ret ptr addrspace(8) %[[rsrc]]
  %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8>
  llvm.return %rsrc : !llvm.ptr<8>
}

// Translation test for the fp8_fp8 and bf8_bf8 variants of rocdl.wmma.f32.
llvm.func @rocdl.wmma.fp8(%arg0 : vector<2 x i32>, %arg1 : vector<8xf32>) -> vector<8xf32> {
  // CHECK-LABEL: @rocdl.wmma.fp8(
  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <8 x float> %{{.*}})
  %r0 = rocdl.wmma.f32.16x16x16.fp8_fp8 %arg0, %arg0, %arg1: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>

  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <8 x float> %{{.*}})
  %r1 = rocdl.wmma.f32.16x16x16.bf8_bf8 %arg0, %arg0, %arg1: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>

  llvm.return %r0 : vector<8 x f32>
}

// Translation test for raw buffer loads and stores through an
// address-space-8 resource pointer, at i32, vector<2xi32> and vector<4xi32>.
llvm.func @rocdl.raw.ptr.buffer(%rsrc : !llvm.ptr<8>,
    %offset : i32, %soffset : i32,
%vdata1 : i32,
    %vdata2 : vector<2xi32>,
    %vdata4 : vector<4xi32>) {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.ptr.buffer

  // Loads of i32, vector<2xi32> and vector<4xi32>; each expected call is
  // listed directly above the op that produces it.
  // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  %r1 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : i32
  // CHECK: call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  %r2 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xi32>
  // CHECK: call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  %r4 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32>

  // Stores of the same three widths.
  // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32
  // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32>
  // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32>

  llvm.return
}

// Translation test for the f32 atomics (fadd, fmax) on an address-space-8
// buffer resource pointer.
llvm.func @rocdl.raw.ptr.buffer.atomic.f32(%rsrc : !llvm.ptr<8>,
    %offset : i32, %soffset : i32,
    %vdata1 : f32) {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.f32

  // CHECK: call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32
  // CHECK: call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32

  llvm.return
}

// Translation test for the i32 atomics (smax, umin) on an address-space-8
// buffer resource pointer.
llvm.func @rocdl.raw.ptr.buffer.atomic.i32(%rsrc : !llvm.ptr<8>,
    %offset : i32, %soffset : i32,
    %vdata1 : i32) {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.i32

  // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
  // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.ptr.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32

  llvm.return
}

// Translation test for the compare-and-swap atomic on an address-space-8
// buffer resource pointer; the call result must be the value returned.
llvm.func @rocdl.raw.ptr.buffer.atomic.cmpswap(%rsrc : !llvm.ptr<8>,
    %offset : i32, %soffset : i32,
    %src : i32, %cmp : i32) -> i32 {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.cmpswap

  // CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  %val = rocdl.raw.ptr.buffer.atomic.cmpswap %src, %cmp, %rsrc, %offset, %soffset, %aux : i32
  // CHECK: ret i32 [[val]]
  llvm.return %val : i32
}

// Translation test for raw buffer loads and stores that take the resource as
// a vector<4xi32> value, at i32, vector<2xi32> and vector<4xi32>.
llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>,
    %offset : i32, %soffset : i32,
    %vdata1 : i32,
    %vdata2 : vector<2xi32>,
    %vdata4 : vector<4xi32>) {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.buffer
  // CHECK: call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  // CHECK: call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
// CHECK: call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}

  // CHECK: call void @llvm.amdgcn.raw.buffer.store.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  // CHECK: call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  // CHECK: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}

  // Loads first, then stores, in the same order as the expectations above.
  %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : i32
  %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xi32>
  %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32>

  rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32
  rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32>
  rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32>

  llvm.return
}

// Translation test for the f32 atomics (fadd, fmax) taking the resource as a
// vector<4xi32> value.
llvm.func @rocdl.raw.buffer.atomic.f32(%rsrc : vector<4xi32>,
    %offset : i32, %soffset : i32,
    %vdata1 : f32) {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.buffer.atomic.f32

  // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32
  // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32

  llvm.return
}

// Translation test for the i32 atomics (smax, umin) taking the resource as a
// vector<4xi32> value.
llvm.func @rocdl.raw.buffer.atomic.i32(%rsrc : vector<4xi32>,
    %offset : i32, %soffset : i32,
    %vdata1 : i32) {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.buffer.atomic.i32

  // CHECK: call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
  // CHECK: call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32

  llvm.return
}

// Translation test for the compare-and-swap atomic taking the resource as a
// vector<4xi32> value; the call result must be the value returned.
llvm.func @rocdl.raw.buffer.atomic.cmpswap(%rsrc : vector<4xi32>,
    %offset : i32, %soffset : i32,
    %src : i32, %cmp : i32) -> i32 {
  %aux = llvm.mlir.constant(0 : i32) : i32
  // CHECK-LABEL: rocdl.raw.buffer.atomic.cmpswap

  // CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
  %val = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
  // CHECK: ret i32 [[val]]
  llvm.return %val : i32
}

// Translation test for the 8-bit float conversion ops: two conversions to
// f32, two packed conversions from f32, and two "sr" conversions from f32
// that also take %stoch. The trailing constant operand of each op must
// appear as the last argument of the corresponding intrinsic call.
llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
  // CHECK-LABEL: @rocdl_8bit_floats
  // CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %{{.+}}, i32 0)
  // CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %{{.+}}, i32 0)
  // CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %{{.+}}, float %{{.+}}, i32 %{{.+}}, i1 false)
  // CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %{{.+}}, float %{{.+}}, i32 %{{.+}}, i1 false)
  // CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %{{.+}}, i32 %{{.+}}, i32 %{{.+}}, i32 2)
  // CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %{{.+}}, i32 %{{.+}}, i32 %{{.+}}, i32 3)
  %c0 = llvm.mlir.constant(0 : i32) : i32
  %c2 = llvm.mlir.constant(2 : i32) : i32
  %c3 = llvm.mlir.constant(3 : i32) : i32
  %false = llvm.mlir.constant(false) : i1
  %v1 = rocdl.cvt.f32.bf8 %source[%c0] : f32
  %v2 = rocdl.cvt.f32.fp8 %source[%c0] : f32
  %source2 = rocdl.cvt.pk.bf8.f32 %v1, %v2 -> %source[%false] : i32
  %source3 = rocdl.cvt.pk.fp8.f32 %v1, %v2 ->
%source2[%false] : i32
  %source4 = rocdl.cvt.sr.bf8.f32 %v1, %stoch -> %source3[%c2] : i32
  %source5 = rocdl.cvt.sr.fp8.f32 %v2, %stoch -> %source4[%c3] : i32
  llvm.return %source5 : i32
}

// Translation test for rocdl.cvt.pkrtz, which takes two f32 values and
// yields a vector<2xf16>.
llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> {
  // CHECK-LABEL: @rocdl_16bit_packed_floats
  // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}})
  %source = rocdl.cvt.pkrtz %sourceA, %sourceB : vector<2xf16>
  llvm.return %source : vector<2xf16>
}

// Verifies that the three rocdl.* unit attributes on llvm.atomicrmw are
// emitted as !amdgpu.* metadata on the resulting atomicrmw instruction.
llvm.func @rocdl_atomic_attrs(%ptr: !llvm.ptr<1>, %data: f32) {
  // CHECK-LABEL: @rocdl_atomic_attrs
  // CHECK: atomicrmw
  // CHECK-SAME: !amdgpu.ignore.denormal.mode
  // CHECK-SAME: !amdgpu.no.fine.grained.memory
  // CHECK-SAME: !amdgpu.no.remote.memory
  llvm.atomicrmw fadd %ptr, %data monotonic {
      rocdl.ignore_denormal_mode,
      rocdl.no_fine_grained_memory,
      rocdl.no_remote_memory} : !llvm.ptr<1>, f32
  llvm.return
}

// Verifies that rocdl.last_use on llvm.load is emitted as !amdgpu.last.use
// metadata on the load, and that the loaded value is what gets returned.
llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
  // CHECK-LABEL: @rocdl_last_use
  // CHECK: %[[ret:.+]] = load
  // CHECK-SAME: !amdgpu.last.use
  // CHECK: ret i32 %[[ret]]
  %ret = llvm.load %ptr {rocdl.last_use} : !llvm.ptr<1> -> i32
  llvm.return %ret : i32
}

// Expected definitions of the attribute groups and metadata node captured by
// the kernel functions at the top of this file. They are matched in any order
// because they are emitted after all function bodies.
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}
// CHECK-DAG: attributes #[[$KERNEL_WAVES_PER_EU_ATTR]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "uniform-work-group-size"="true" }
// CHECK-DAG: attributes #[[$KERNEL_UNSAFE_FP_ATOMICS_ATTR]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-unsafe-fp-atomics"="true" "uniform-work-group-size"="true" }