; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll (revision 3277c7cd28154e33637a168acb26cea7ac1f7fff)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; fneg of the f16 A operand folds into the WMMA neg_lo/neg_hi:[1,0,0] modifiers.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fneg of the f16 B operand folds into the WMMA neg_lo/neg_hi:[0,1,0] modifiers.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fneg of the f32 accumulator C folds into neg_lo:[0,0,1] only (no neg_hi).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fabs of the f32 accumulator C folds into neg_hi:[0,0,1] only.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; bf16 variant: fneg of accumulator C folds into neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; bf16 variant: fabs of accumulator C folds into neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; f16-accumulator WMMA: fneg of A folds into neg_lo/neg_hi:[1,0,0].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; f16-accumulator WMMA: fneg of B folds into neg_lo/neg_hi:[0,1,0].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; f16-accumulator WMMA: fneg of C folds into neg_lo:[0,0,1] only.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x half> %C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; f16-accumulator WMMA: fabs of C folds into neg_hi:[0,0,1] only.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; fp8xfp8 WMMA: fneg of accumulator C folds into neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fp8xfp8 WMMA: fabs of accumulator C folds into neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; bf8xfp8 WMMA: fneg of accumulator C folds into neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; bf8xfp8 WMMA: fabs of accumulator C folds into neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fp8xbf8 WMMA: fneg of accumulator C folds into neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fp8xbf8 WMMA: fabs of accumulator C folds into neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; bf8xbf8 WMMA: fneg of accumulator C folds into neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; bf8xbf8 WMMA: fabs of accumulator C folds into neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; Sparse WMMA (swmmac): fneg of A folds into neg_lo/neg_hi:[1,0,0].
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; Sparse WMMA (swmmac): fneg of the <16 x half> B operand folds into neg_lo/neg_hi:[0,1,0].
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; Sparse WMMA with f16 accumulator: fneg of A folds into neg_lo/neg_hi:[1,0,0].
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; Sparse WMMA with f16 accumulator: fneg of B folds into neg_lo/neg_hi:[0,1,0].
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; both neg and abs patterns (wmma matrix C f32 or f16)

; fneg(fabs(C)) folds into both neg_lo:[0,0,1] and neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %fneg.fabs.C = fneg <8 x float> %fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; f16 accumulator: fneg(fabs(C)) folds into both neg_lo:[0,0,1] and neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %fneg.fabs.C = fneg <8 x half> %fabs.C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; Only one element of C has fabs applied before the fneg, so the fabs cannot fold
; into neg_hi; expect an explicit v_and_b32 sign-bit clear plus neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %el3 = extractelement <8 x float> %C, i32 3
  %el3.fabs = call float @llvm.fabs.f32(float %el3)
  %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; A or B matrix modifier and constant in C

; fneg of A still folds when C is a splat constant (inline 1.0 operand).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; fneg of B still folds when the f16 accumulator C is a splat constant.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; pack f16 elements with v_perm_b32 since they don't come from same b32

; C comes from a strided shuffle of a loaded <16 x half>, so its elements must be
; repacked (and/lshl_or) before the WMMA; the fneg still folds into neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    flat_load_b128 v[12:15], v[8:9]
; GFX12-NEXT:    flat_load_b128 v[16:19], v[8:9] offset:16
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x101
; GFX12-NEXT:    v_and_b32_e32 v8, 0xffff, v12
; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff, v14
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_and_b32_e32 v14, 0xffff, v16
; GFX12-NEXT:    v_and_b32_e32 v16, 0xffff, v18
; GFX12-NEXT:    v_lshl_or_b32 v12, v13, 16, v8
; GFX12-NEXT:    v_lshl_or_b32 v13, v15, 16, v9
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT:    v_lshl_or_b32 v14, v17, 16, v14
; GFX12-NEXT:    v_lshl_or_b32 v15, v19, 16, v16
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[10:11], v[12:15], off
; GFX12-NEXT:    s_endpgm
bb:
  %C = load <16 x half>, ptr %Caddr
  %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %fneg.C_shuffle = fneg <8 x half> %C_shuffle
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; Intrinsic and fabs declarations used by the tests above.
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare float @llvm.fabs.f32(float)

declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)