; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; Each pair of tests below feeds a WMMA intrinsic a splat-immediate accumulator
; (matrix C) operand:
;   *_imm                — the splat value (1.0 / 1 / 0x3f80 bf16 splat) can be
;                          encoded as an inline constant, so it is folded
;                          directly into the v_wmma_* instruction.
;   *_imm_non_inlineable — the splat value (3.0 / 128 / ...) has no inline
;                          encoding, so it must first be materialized into a
;                          VGPR tuple with v_mov_b32 before the v_wmma_*.

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v7, v6
; GFX12-NEXT:    v_mov_b32_e32 v8, v6
; GFX12-NEXT:    v_mov_b32_e32 v9, v6
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v7, v6
; GFX12-NEXT:    v_mov_b32_e32 v8, v6
; GFX12-NEXT:    v_mov_b32_e32 v9, v6
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v6, 0x42004200
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v7, v6
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3f803f80
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v7, v6
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3fc03fc0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v7, v6
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v5, v4
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v7, v4
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)