; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
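; Test GlobalISel instruction selection of the GFX12 WMMA and SWMMAC intrinsics
; with wavefront size 64.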

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)