; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)