xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (revision 66e0498dafbfa7f8fd7deaa88ae62bdf38a12113)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
4
; Intrinsics under test. Each takes two <8 x half> multiplicand vectors, a float
; accumulator vector (<4 x float> or <16 x float>) that is also the result
; type, and three i32 operands that must be immediates (immarg).
5declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
6declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
7
8; --------------------------------------------------------------------
9; llvm.amdgcn.mfma.f32.16x16x32.f16
10; --------------------------------------------------------------------
11
; Baseline case for llvm.amdgcn.mfma.f32.16x16x32.f16 in a non-kernel function:
; all three immediate operands are 0 and the result is returned directly.
; The GCN checks (a prefix shared by both llc invocations at the top of the
; file) expect %arg2 to be copied from v8-v11 into a[0:3], an MFMA without
; modifiers whose destination is the same a[0:3] tuple, and the result read
; back into v0-v3 before returning.
12define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
13; GCN-LABEL: test_mfma_f32_16x16x32_f16:
14; GCN:       ; %bb.0:
15; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
17; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
18; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
19; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
20; GCN-NEXT:    s_nop 1
21; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
22; GCN-NEXT:    s_nop 6
23; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
24; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
25; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
26; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
27; GCN-NEXT:    s_setpc_b64 s[30:31]
28  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
29  ret <4 x float> %result
30}
31
; Same shape as the baseline 16x16x32 test, but the three immediate operands
; are all 1. The checks expect them to be printed on the MFMA as the
; cbsz, abid and blgp modifiers (cbsz:1 abid:1 blgp:1); the rest of the
; expected sequence is unchanged.
32define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
33; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
34; GCN:       ; %bb.0:
35; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
37; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
38; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
39; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
40; GCN-NEXT:    s_nop 1
41; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
42; GCN-NEXT:    s_nop 6
43; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
44; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
45; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
46; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
47; GCN-NEXT:    s_setpc_b64 s[30:31]
48  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
49  ret <4 x float> %result
50}
51
; Kernel variant of the 16x16x32 test: operands arrive as kernel arguments and
; the result is stored to %out. The function carries attribute group #0, whose
; definition is outside this excerpt (presumably the no-AGPR attribute the
; test name refers to -- TODO confirm).
; As currently generated, both the SDAG and GISEL checks still place the
; accumulator and the result in a[0:3] and store straight from a[0:3]. The two
; selectors differ only in where the VGPR holding 0 (the store's offset
; operand) is materialized -- v8 before the wait for SDAG, v0 after the MFMA
; for GISEL -- and in the s_nop count after the MFMA (6 vs. 5).
52define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
53; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
54; SDAG:       ; %bb.0:
55; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
56; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
57; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
58; SDAG-NEXT:    v_mov_b32_e32 v8, 0
59; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
60; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
61; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
62; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
63; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
64; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
65; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
66; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
67; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
68; SDAG-NEXT:    s_nop 1
69; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
70; SDAG-NEXT:    s_nop 6
71; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
72; SDAG-NEXT:    s_endpgm
73;
74; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
75; GISEL:       ; %bb.0:
76; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
77; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
78; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
79; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
80; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
81; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
82; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
83; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
84; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
85; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
86; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
87; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
88; GISEL-NEXT:    s_nop 1
89; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
90; GISEL-NEXT:    v_mov_b32_e32 v0, 0
91; GISEL-NEXT:    s_nop 5
92; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
93; GISEL-NEXT:    s_endpgm
94  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
95  store <4 x float> %result, ptr addrspace(1) %out
96  ret void
97}
98
; Kernel variant of the 16x16x32 test with three distinct immediates (3, 2, 1),
; stored to %out. Because the values differ, the expected modifiers
; cbsz:3 abid:2 blgp:1 also pin the operand order: first immediate -> cbsz,
; second -> abid, third -> blgp.
; Attribute group #0 is defined outside this excerpt (presumably the no-AGPR
; attribute the test name refers to -- TODO confirm); as currently generated,
; both the SDAG and GISEL checks still use a[0:3] for accumulator and result.
99define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
100; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
101; SDAG:       ; %bb.0:
102; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
103; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
104; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
105; SDAG-NEXT:    v_mov_b32_e32 v8, 0
106; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
107; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
108; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
109; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
110; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
111; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
112; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
113; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
114; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
115; SDAG-NEXT:    s_nop 1
116; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
117; SDAG-NEXT:    s_nop 6
118; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
119; SDAG-NEXT:    s_endpgm
120;
121; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
122; GISEL:       ; %bb.0:
123; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
124; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
125; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
126; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
127; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
128; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
129; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
130; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
131; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
132; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
133; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
134; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
135; GISEL-NEXT:    s_nop 1
136; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
137; GISEL-NEXT:    v_mov_b32_e32 v0, 0
138; GISEL-NEXT:    s_nop 5
139; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
140; GISEL-NEXT:    s_endpgm
141  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
142  store <4 x float> %result, ptr addrspace(1) %out
143  ret void
144}
145
146; --------------------------------------------------------------------
147; llvm.amdgcn.mfma.f32.32x32x16.f16
148; --------------------------------------------------------------------
149
; Kernel test for llvm.amdgcn.mfma.f32.32x32x16.f16 with all immediates 0.
; Both the result and the original accumulator %arg2 are written with volatile
; stores to a null addrspace(1) pointer, so %arg2 is used again after the call.
; Attribute group #1 is defined outside this excerpt.
; Expected code: the accumulator is loaded into a[0:15] and the MFMA writes a
; separate tuple a[16:31]. Each volatile <16 x float> store appears as four
; dwordx4 stores marked sc0 sc1, each followed by s_waitcnt vmcnt(0). SDAG
; and GISEL emit the stores in different orders and build the %arg2 copies
; differently (v_mov_b32 vs. v_mov_b64).
150define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 {
151; SDAG-LABEL: test_mfma_f32_32x32x16_f16:
152; SDAG:       ; %bb.0:
153; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
154; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
155; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 48
156; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 32
157; SDAG-NEXT:    v_mov_b64_e32 v[16:17], 16
158; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
159; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
160; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
161; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
162; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
163; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
164; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
165; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
166; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
167; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
168; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
169; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
170; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
171; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
172; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
173; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
174; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
175; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
176; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
177; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
178; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
179; SDAG-NEXT:    v_mov_b64_e32 v[18:19], 0
180; SDAG-NEXT:    v_mov_b32_e32 v8, s16
181; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
182; SDAG-NEXT:    v_mov_b32_e32 v0, s20
183; SDAG-NEXT:    v_mov_b32_e32 v1, s21
184; SDAG-NEXT:    v_mov_b32_e32 v2, s22
185; SDAG-NEXT:    v_mov_b32_e32 v3, s23
186; SDAG-NEXT:    v_mov_b32_e32 v9, s17
187; SDAG-NEXT:    v_mov_b32_e32 v10, s18
188; SDAG-NEXT:    v_mov_b32_e32 v11, s19
189; SDAG-NEXT:    s_nop 3
190; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
191; SDAG-NEXT:    s_waitcnt vmcnt(0)
192; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
193; SDAG-NEXT:    s_waitcnt vmcnt(0)
194; SDAG-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
195; SDAG-NEXT:    s_waitcnt vmcnt(0)
196; SDAG-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
197; SDAG-NEXT:    s_waitcnt vmcnt(0)
198; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
199; SDAG-NEXT:    s_waitcnt vmcnt(0)
200; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
201; SDAG-NEXT:    s_waitcnt vmcnt(0)
202; SDAG-NEXT:    s_nop 0
203; SDAG-NEXT:    v_mov_b32_e32 v0, s8
204; SDAG-NEXT:    v_mov_b32_e32 v1, s9
205; SDAG-NEXT:    v_mov_b32_e32 v2, s10
206; SDAG-NEXT:    v_mov_b32_e32 v3, s11
207; SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
208; SDAG-NEXT:    s_waitcnt vmcnt(0)
209; SDAG-NEXT:    s_nop 0
210; SDAG-NEXT:    v_mov_b32_e32 v0, s12
211; SDAG-NEXT:    v_mov_b32_e32 v1, s13
212; SDAG-NEXT:    v_mov_b32_e32 v2, s14
213; SDAG-NEXT:    v_mov_b32_e32 v3, s15
214; SDAG-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
215; SDAG-NEXT:    s_waitcnt vmcnt(0)
216; SDAG-NEXT:    s_endpgm
217;
218; GISEL-LABEL: test_mfma_f32_32x32x16_f16:
219; GISEL:       ; %bb.0:
220; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
221; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
222; GISEL-NEXT:    v_mov_b64_e32 v[20:21], 0
223; GISEL-NEXT:    v_mov_b64_e32 v[26:27], 48
224; GISEL-NEXT:    v_mov_b64_e32 v[22:23], 16
225; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
226; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
227; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
228; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
229; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
230; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
231; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
232; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
233; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
234; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
235; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
236; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
237; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
238; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
239; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
240; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
241; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
242; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
243; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
244; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
245; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
246; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
247; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
248; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
249; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
250; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
251; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
252; GISEL-NEXT:    v_mov_b64_e32 v[24:25], 32
253; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
254; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
255; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
256; GISEL-NEXT:    s_nop 3
257; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
258; GISEL-NEXT:    s_waitcnt vmcnt(0)
259; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
260; GISEL-NEXT:    s_waitcnt vmcnt(0)
261; GISEL-NEXT:    global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
262; GISEL-NEXT:    s_waitcnt vmcnt(0)
263; GISEL-NEXT:    global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
264; GISEL-NEXT:    s_waitcnt vmcnt(0)
265; GISEL-NEXT:    global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
266; GISEL-NEXT:    s_waitcnt vmcnt(0)
267; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
268; GISEL-NEXT:    s_waitcnt vmcnt(0)
269; GISEL-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
270; GISEL-NEXT:    s_waitcnt vmcnt(0)
271; GISEL-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
272; GISEL-NEXT:    s_waitcnt vmcnt(0)
273; GISEL-NEXT:    s_endpgm
274  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
275  store volatile <16 x float> %result, ptr addrspace(1) null
276  store volatile <16 x float> %arg2, ptr addrspace(1) null
277  ret void
278}
279
; Kernel test for llvm.amdgcn.mfma.f32.32x32x16.f16 with immediates (2, 3, 1).
; The checks expect them as cbsz:2 abid:3 blgp:1 on the MFMA. As in the
; zero-immediate kernel test, the result and %arg2 are both written with
; volatile stores to a null addrspace(1) pointer, and the expected MFMA reads
; the accumulator from a[0:15] and writes a[16:31].
; Attribute group #1 is defined outside this excerpt.
280define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 {
281; SDAG-LABEL: test_mfma_f32_32x32x16_f16__flags:
282; SDAG:       ; %bb.0:
283; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
284; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
285; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 48
286; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 32
287; SDAG-NEXT:    v_mov_b64_e32 v[16:17], 16
288; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
289; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
290; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
291; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
292; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
293; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
294; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
295; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
296; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
297; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
298; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
299; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
300; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
301; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
302; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
303; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
304; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
305; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
306; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
307; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
308; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
309; SDAG-NEXT:    v_mov_b64_e32 v[18:19], 0
310; SDAG-NEXT:    v_mov_b32_e32 v8, s16
311; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
312; SDAG-NEXT:    v_mov_b32_e32 v0, s20
313; SDAG-NEXT:    v_mov_b32_e32 v1, s21
314; SDAG-NEXT:    v_mov_b32_e32 v2, s22
315; SDAG-NEXT:    v_mov_b32_e32 v3, s23
316; SDAG-NEXT:    v_mov_b32_e32 v9, s17
317; SDAG-NEXT:    v_mov_b32_e32 v10, s18
318; SDAG-NEXT:    v_mov_b32_e32 v11, s19
319; SDAG-NEXT:    s_nop 3
320; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
321; SDAG-NEXT:    s_waitcnt vmcnt(0)
322; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
323; SDAG-NEXT:    s_waitcnt vmcnt(0)
324; SDAG-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
325; SDAG-NEXT:    s_waitcnt vmcnt(0)
326; SDAG-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
327; SDAG-NEXT:    s_waitcnt vmcnt(0)
328; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
329; SDAG-NEXT:    s_waitcnt vmcnt(0)
330; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
331; SDAG-NEXT:    s_waitcnt vmcnt(0)
332; SDAG-NEXT:    s_nop 0
333; SDAG-NEXT:    v_mov_b32_e32 v0, s8
334; SDAG-NEXT:    v_mov_b32_e32 v1, s9
335; SDAG-NEXT:    v_mov_b32_e32 v2, s10
336; SDAG-NEXT:    v_mov_b32_e32 v3, s11
337; SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
338; SDAG-NEXT:    s_waitcnt vmcnt(0)
339; SDAG-NEXT:    s_nop 0
340; SDAG-NEXT:    v_mov_b32_e32 v0, s12
341; SDAG-NEXT:    v_mov_b32_e32 v1, s13
342; SDAG-NEXT:    v_mov_b32_e32 v2, s14
343; SDAG-NEXT:    v_mov_b32_e32 v3, s15
344; SDAG-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
345; SDAG-NEXT:    s_waitcnt vmcnt(0)
346; SDAG-NEXT:    s_endpgm
347;
348; GISEL-LABEL: test_mfma_f32_32x32x16_f16__flags:
349; GISEL:       ; %bb.0:
350; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
351; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
352; GISEL-NEXT:    v_mov_b64_e32 v[20:21], 0
353; GISEL-NEXT:    v_mov_b64_e32 v[26:27], 48
354; GISEL-NEXT:    v_mov_b64_e32 v[22:23], 16
355; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
356; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
357; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
358; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
359; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
360; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
361; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
362; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
363; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
364; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
365; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
366; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
367; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
368; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
369; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
370; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
371; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
372; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
373; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
374; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
375; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
376; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
377; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
378; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
379; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
380; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
381; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
382; GISEL-NEXT:    v_mov_b64_e32 v[24:25], 32
383; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
384; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
385; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
386; GISEL-NEXT:    s_nop 3
387; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
388; GISEL-NEXT:    s_waitcnt vmcnt(0)
389; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
390; GISEL-NEXT:    s_waitcnt vmcnt(0)
391; GISEL-NEXT:    global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
392; GISEL-NEXT:    s_waitcnt vmcnt(0)
393; GISEL-NEXT:    global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
394; GISEL-NEXT:    s_waitcnt vmcnt(0)
395; GISEL-NEXT:    global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
396; GISEL-NEXT:    s_waitcnt vmcnt(0)
397; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
398; GISEL-NEXT:    s_waitcnt vmcnt(0)
399; GISEL-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
400; GISEL-NEXT:    s_waitcnt vmcnt(0)
401; GISEL-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
402; GISEL-NEXT:    s_waitcnt vmcnt(0)
403; GISEL-NEXT:    s_endpgm
404  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
405  store volatile <16 x float> %result, ptr addrspace(1) null
406  store volatile <16 x float> %arg2, ptr addrspace(1) null
407  ret void
408}
409
; Non-kernel 32x32x16 test where %arg2 has no use other than the intrinsic
; call and the result is returned. The GCN checks expect the accumulator to
; be copied from v8-v23 into a[0:15], an MFMA whose destination and
; accumulator operand are the same a[0:15] tuple, two s_nop instructions
; (7 then 2), and the result read back into v0-v15.
; NOTE(review): "__mac" in the name presumably refers to this in-place
; accumulate form -- confirm against the commit that added the test.
410define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
411; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
412; GCN:       ; %bb.0:
413; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
415; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
416; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
417; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
418; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
419; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
420; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
421; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
422; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
423; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
424; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
425; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
426; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
427; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
428; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
429; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
430; GCN-NEXT:    s_nop 1
431; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
432; GCN-NEXT:    s_nop 7
433; GCN-NEXT:    s_nop 2
434; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
435; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
436; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
437; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
438; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
439; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
440; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
441; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
442; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
443; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
444; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
445; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
446; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
447; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
448; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
449; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
450; GCN-NEXT:    s_setpc_b64 s[30:31]
451  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
452  ret <16 x float> %result
453}
454
; Non-kernel 32x32x16 test with the three immediates all set to 1; the result
; is returned. The checks expect cbsz:1 abid:1 blgp:1 on an MFMA whose
; destination and accumulator operand are the same a[0:15] tuple, with the
; same copy-in / read-back sequence as the zero-immediate "__mac" test.
455define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
456; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
457; GCN:       ; %bb.0:
458; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
459; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
460; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
461; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
462; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
463; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
464; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
465; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
466; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
467; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
468; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
469; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
470; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
471; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
472; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
473; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
474; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
475; GCN-NEXT:    s_nop 1
476; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
477; GCN-NEXT:    s_nop 7
478; GCN-NEXT:    s_nop 2
479; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
480; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
481; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
482; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
483; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
484; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
485; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
486; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
487; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
488; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
489; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
490; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
491; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
492; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
493; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
494; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
495; GCN-NEXT:    s_setpc_b64 s[30:31]
496  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
497  ret <16 x float> %result
498}
499
; Kernel 32x32x16 test with all immediates 0 that stores to a real %out
; pointer: first %arg2, then the result, both with volatile stores to the
; same address. Attribute group #0 is defined outside this excerpt
; (presumably the attribute the "vgprcd" name depends on -- TODO confirm).
; As currently generated, the result still lives in AGPRs for both selectors,
; with opposite tuple assignments: SDAG loads the accumulator into a[16:31]
; (written in descending register order) and produces a[0:15], while GISEL
; loads a[0:15] and produces a[16:31]. The %arg2 stores are issued from VGPR
; copies of the SGPR arguments, ahead of the result stores.
500define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
501; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
502; SDAG:       ; %bb.0:
503; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
504; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
505; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
506; SDAG-NEXT:    v_mov_b32_e32 v12, 0
507; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
508; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
509; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
510; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
511; SDAG-NEXT:    v_accvgpr_write_b32 a31, s23
512; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
513; SDAG-NEXT:    v_accvgpr_write_b32 a30, s22
514; SDAG-NEXT:    v_accvgpr_write_b32 a29, s21
515; SDAG-NEXT:    v_accvgpr_write_b32 a28, s20
516; SDAG-NEXT:    v_accvgpr_write_b32 a27, s19
517; SDAG-NEXT:    v_accvgpr_write_b32 a26, s18
518; SDAG-NEXT:    v_accvgpr_write_b32 a25, s17
519; SDAG-NEXT:    v_accvgpr_write_b32 a24, s16
520; SDAG-NEXT:    v_accvgpr_write_b32 a23, s15
521; SDAG-NEXT:    v_accvgpr_write_b32 a22, s14
522; SDAG-NEXT:    v_accvgpr_write_b32 a21, s13
523; SDAG-NEXT:    v_accvgpr_write_b32 a20, s12
524; SDAG-NEXT:    v_accvgpr_write_b32 a19, s11
525; SDAG-NEXT:    v_accvgpr_write_b32 a18, s10
526; SDAG-NEXT:    v_accvgpr_write_b32 a17, s9
527; SDAG-NEXT:    v_accvgpr_write_b32 a16, s8
528; SDAG-NEXT:    v_mov_b32_e32 v8, s20
529; SDAG-NEXT:    v_mov_b32_e32 v9, s21
530; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
531; SDAG-NEXT:    v_mov_b32_e32 v10, s22
532; SDAG-NEXT:    v_mov_b32_e32 v11, s23
533; SDAG-NEXT:    v_mov_b32_e32 v0, s16
534; SDAG-NEXT:    v_mov_b32_e32 v1, s17
535; SDAG-NEXT:    v_mov_b32_e32 v2, s18
536; SDAG-NEXT:    v_mov_b32_e32 v3, s19
537; SDAG-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
538; SDAG-NEXT:    s_waitcnt vmcnt(0)
539; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
540; SDAG-NEXT:    s_waitcnt vmcnt(0)
541; SDAG-NEXT:    s_nop 0
542; SDAG-NEXT:    v_mov_b32_e32 v0, s12
543; SDAG-NEXT:    v_mov_b32_e32 v1, s13
544; SDAG-NEXT:    v_mov_b32_e32 v2, s14
545; SDAG-NEXT:    v_mov_b32_e32 v3, s15
546; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
547; SDAG-NEXT:    s_waitcnt vmcnt(0)
548; SDAG-NEXT:    s_nop 0
549; SDAG-NEXT:    v_mov_b32_e32 v0, s8
550; SDAG-NEXT:    v_mov_b32_e32 v1, s9
551; SDAG-NEXT:    v_mov_b32_e32 v2, s10
552; SDAG-NEXT:    v_mov_b32_e32 v3, s11
553; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
554; SDAG-NEXT:    s_waitcnt vmcnt(0)
555; SDAG-NEXT:    global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
556; SDAG-NEXT:    s_waitcnt vmcnt(0)
557; SDAG-NEXT:    global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
558; SDAG-NEXT:    s_waitcnt vmcnt(0)
559; SDAG-NEXT:    global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
560; SDAG-NEXT:    s_waitcnt vmcnt(0)
561; SDAG-NEXT:    global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
562; SDAG-NEXT:    s_waitcnt vmcnt(0)
563; SDAG-NEXT:    s_endpgm
564;
565; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
566; GISEL:       ; %bb.0:
567; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
568; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
569; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
570; GISEL-NEXT:    v_mov_b32_e32 v24, 0
571; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
572; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
573; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
574; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
575; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
576; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
577; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
578; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
579; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
580; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
581; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
582; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
583; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
584; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
585; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
586; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
587; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
588; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
589; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
590; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
591; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
592; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
593; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
594; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
595; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
596; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
597; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
598; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
599; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
600; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
601; GISEL-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
602; GISEL-NEXT:    s_waitcnt vmcnt(0)
603; GISEL-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
604; GISEL-NEXT:    s_waitcnt vmcnt(0)
605; GISEL-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
606; GISEL-NEXT:    s_waitcnt vmcnt(0)
607; GISEL-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
608; GISEL-NEXT:    s_waitcnt vmcnt(0)
609; GISEL-NEXT:    global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
610; GISEL-NEXT:    s_waitcnt vmcnt(0)
611; GISEL-NEXT:    global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
612; GISEL-NEXT:    s_waitcnt vmcnt(0)
613; GISEL-NEXT:    global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
614; GISEL-NEXT:    s_waitcnt vmcnt(0)
615; GISEL-NEXT:    global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
616; GISEL-NEXT:    s_waitcnt vmcnt(0)
617; GISEL-NEXT:    s_endpgm
618  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
619  store volatile <16 x float> %arg2, ptr addrspace(1) %out
620  store volatile <16 x float> %result, ptr addrspace(1) %out
621  ret void
622}
623
; Kernel test: calls llvm.amdgcn.mfma.f32.32x32x16.f16 with the three trailing
; immediates set to 1, 2, 3 (printed as cbsz:1 abid:2 blgp:3 on the MFMA in the
; expected output), then volatile-stores both %arg2 and the result to %out.
; The SelectionDAG and GlobalISel runs produce different code, so each has its
; own set of checks below.
; NOTE(review): attribute group #0 is defined outside this part of the file.
624define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
625; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
626; SDAG:       ; %bb.0:
627; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
628; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
629; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
630; SDAG-NEXT:    v_mov_b32_e32 v12, 0
631; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
632; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
633; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
634; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
635; SDAG-NEXT:    v_accvgpr_write_b32 a31, s23
636; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
637; SDAG-NEXT:    v_accvgpr_write_b32 a30, s22
638; SDAG-NEXT:    v_accvgpr_write_b32 a29, s21
639; SDAG-NEXT:    v_accvgpr_write_b32 a28, s20
640; SDAG-NEXT:    v_accvgpr_write_b32 a27, s19
641; SDAG-NEXT:    v_accvgpr_write_b32 a26, s18
642; SDAG-NEXT:    v_accvgpr_write_b32 a25, s17
643; SDAG-NEXT:    v_accvgpr_write_b32 a24, s16
644; SDAG-NEXT:    v_accvgpr_write_b32 a23, s15
645; SDAG-NEXT:    v_accvgpr_write_b32 a22, s14
646; SDAG-NEXT:    v_accvgpr_write_b32 a21, s13
647; SDAG-NEXT:    v_accvgpr_write_b32 a20, s12
648; SDAG-NEXT:    v_accvgpr_write_b32 a19, s11
649; SDAG-NEXT:    v_accvgpr_write_b32 a18, s10
650; SDAG-NEXT:    v_accvgpr_write_b32 a17, s9
651; SDAG-NEXT:    v_accvgpr_write_b32 a16, s8
652; SDAG-NEXT:    v_mov_b32_e32 v8, s20
653; SDAG-NEXT:    v_mov_b32_e32 v9, s21
654; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
655; SDAG-NEXT:    v_mov_b32_e32 v10, s22
656; SDAG-NEXT:    v_mov_b32_e32 v11, s23
657; SDAG-NEXT:    v_mov_b32_e32 v0, s16
658; SDAG-NEXT:    v_mov_b32_e32 v1, s17
659; SDAG-NEXT:    v_mov_b32_e32 v2, s18
660; SDAG-NEXT:    v_mov_b32_e32 v3, s19
661; SDAG-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
662; SDAG-NEXT:    s_waitcnt vmcnt(0)
663; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
664; SDAG-NEXT:    s_waitcnt vmcnt(0)
665; SDAG-NEXT:    s_nop 0
666; SDAG-NEXT:    v_mov_b32_e32 v0, s12
667; SDAG-NEXT:    v_mov_b32_e32 v1, s13
668; SDAG-NEXT:    v_mov_b32_e32 v2, s14
669; SDAG-NEXT:    v_mov_b32_e32 v3, s15
670; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
671; SDAG-NEXT:    s_waitcnt vmcnt(0)
672; SDAG-NEXT:    s_nop 0
673; SDAG-NEXT:    v_mov_b32_e32 v0, s8
674; SDAG-NEXT:    v_mov_b32_e32 v1, s9
675; SDAG-NEXT:    v_mov_b32_e32 v2, s10
676; SDAG-NEXT:    v_mov_b32_e32 v3, s11
677; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
678; SDAG-NEXT:    s_waitcnt vmcnt(0)
679; SDAG-NEXT:    global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
680; SDAG-NEXT:    s_waitcnt vmcnt(0)
681; SDAG-NEXT:    global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
682; SDAG-NEXT:    s_waitcnt vmcnt(0)
683; SDAG-NEXT:    global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
684; SDAG-NEXT:    s_waitcnt vmcnt(0)
685; SDAG-NEXT:    global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
686; SDAG-NEXT:    s_waitcnt vmcnt(0)
687; SDAG-NEXT:    s_endpgm
688;
689; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
690; GISEL:       ; %bb.0:
691; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
692; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
693; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
694; GISEL-NEXT:    v_mov_b32_e32 v24, 0
695; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
696; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
697; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
698; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
699; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
700; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
701; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
702; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
703; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
704; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
705; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
706; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
707; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
708; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
709; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
710; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
711; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
712; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
713; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
714; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
715; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
716; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
717; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
718; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
719; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
720; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
721; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
722; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
723; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
724; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
725; GISEL-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
726; GISEL-NEXT:    s_waitcnt vmcnt(0)
727; GISEL-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
728; GISEL-NEXT:    s_waitcnt vmcnt(0)
729; GISEL-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
730; GISEL-NEXT:    s_waitcnt vmcnt(0)
731; GISEL-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
732; GISEL-NEXT:    s_waitcnt vmcnt(0)
733; GISEL-NEXT:    global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
734; GISEL-NEXT:    s_waitcnt vmcnt(0)
735; GISEL-NEXT:    global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
736; GISEL-NEXT:    s_waitcnt vmcnt(0)
737; GISEL-NEXT:    global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
738; GISEL-NEXT:    s_waitcnt vmcnt(0)
739; GISEL-NEXT:    global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
740; GISEL-NEXT:    s_waitcnt vmcnt(0)
741; GISEL-NEXT:    s_endpgm
742  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
  ; %arg2 is used again after the call; in the expected output the MFMA
  ; destination and accumulator source are different register ranges.
743  store volatile <16 x float> %arg2, ptr addrspace(1) %out
744  store volatile <16 x float> %result, ptr addrspace(1) %out
745  ret void
746}
747
; Kernel test: calls llvm.amdgcn.mfma.f32.32x32x16.f16 with all three trailing
; immediates zero and stores only the result to %out (plain, non-volatile
; store). %arg2 has no use after the call, and in the expected output the MFMA
; uses a[0:15] as both destination and accumulator source.
; NOTE(review): attribute group #0 is defined outside this part of the file.
748define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
749; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
750; SDAG:       ; %bb.0:
751; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
752; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
753; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
754; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
755; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
756; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
757; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
758; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
759; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
760; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
761; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
762; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
763; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
764; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
765; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
766; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
767; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
768; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
769; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
770; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
771; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
772; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
773; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
774; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
775; SDAG-NEXT:    s_nop 1
776; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
777; SDAG-NEXT:    v_mov_b32_e32 v0, 0
778; SDAG-NEXT:    s_nop 7
779; SDAG-NEXT:    s_nop 1
780; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
781; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
782; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
783; SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
784; SDAG-NEXT:    s_endpgm
785;
786; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
787; GISEL:       ; %bb.0:
788; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
789; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
790; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
791; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
792; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
793; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
794; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
795; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
796; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
797; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
798; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
799; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
800; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
801; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
802; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
803; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
804; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
805; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
806; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
807; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
808; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
809; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
810; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
811; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
812; GISEL-NEXT:    s_nop 1
813; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
814; GISEL-NEXT:    v_mov_b32_e32 v0, 0
815; GISEL-NEXT:    s_nop 7
816; GISEL-NEXT:    s_nop 1
817; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
818; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
819; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
820; GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
821; GISEL-NEXT:    s_endpgm
822  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
823  store <16 x float> %result, ptr addrspace(1) %out
824  ret void
825}
826
; Same shape as the preceding "_mac" kernel (result-only, non-volatile store to
; %out), but the three trailing immediates are 3, 2, 1; the expected MFMA
; prints them as cbsz:3 abid:2 blgp:1 and still uses a[0:15] as both
; destination and accumulator source.
; NOTE(review): attribute group #0 is defined outside this part of the file.
827define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
828; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
829; SDAG:       ; %bb.0:
830; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
831; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
832; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
833; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
834; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
835; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
836; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
837; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
838; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
839; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
840; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
841; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
842; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
843; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
844; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
845; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
846; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
847; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
848; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
849; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
850; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
851; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
852; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
853; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
854; SDAG-NEXT:    s_nop 1
855; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
856; SDAG-NEXT:    v_mov_b32_e32 v0, 0
857; SDAG-NEXT:    s_nop 7
858; SDAG-NEXT:    s_nop 1
859; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
860; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
861; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
862; SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
863; SDAG-NEXT:    s_endpgm
864;
865; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
866; GISEL:       ; %bb.0:
867; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
868; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
869; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
870; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
871; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
872; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
873; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
874; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
875; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
876; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
877; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
878; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
879; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
880; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
881; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
882; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
883; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
884; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
885; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
886; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
887; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
888; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
889; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
890; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
891; GISEL-NEXT:    s_nop 1
892; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
893; GISEL-NEXT:    v_mov_b32_e32 v0, 0
894; GISEL-NEXT:    s_nop 7
895; GISEL-NEXT:    s_nop 1
896; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
897; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
898; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
899; GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
900; GISEL-NEXT:    s_endpgm
901  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
902  store <16 x float> %result, ptr addrspace(1) %out
903  ret void
904}
905
906; --------------------------------------------------------------------
907; llvm.amdgcn.mfma.i32.16x16x64.i8
908; --------------------------------------------------------------------
909
; Operands as used by the tests below: two <4 x i32> inputs, a <4 x i32>
; accumulator, then three immediates that the expected assembly prints, in
; order, as cbsz, abid and blgp. Returns <4 x i32>.
910declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
911
; Non-kernel function test: passes its three arguments straight to
; llvm.amdgcn.mfma.i32.16x16x64.i8 with all immediates zero and returns the
; result. Both instruction selectors are checked against one shared set of
; lines (the common check prefix), so their output is expected to be identical.
912define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
913; GCN-LABEL: test_mfma_i32_16x16x64_i8:
914; GCN:       ; %bb.0:
915; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
916; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
917; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
918; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
919; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
920; GCN-NEXT:    s_nop 1
921; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
922; GCN-NEXT:    s_nop 6
923; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
924; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
925; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
926; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
927; GCN-NEXT:    s_setpc_b64 s[30:31]
928  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
929  ret <4 x i32> %result
930}
931
; Same as test_mfma_i32_16x16x64_i8 but with the three trailing immediates all
; set to 1; the expected MFMA prints them as cbsz:1 abid:1 blgp:1. The rest of
; the expected sequence matches the zero-immediate variant.
932define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
933; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
934; GCN:       ; %bb.0:
935; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
937; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
938; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
939; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
940; GCN-NEXT:    s_nop 1
941; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
942; GCN-NEXT:    s_nop 6
943; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
944; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
945; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
946; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
947; GCN-NEXT:    s_setpc_b64 s[30:31]
948  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1)
949  ret <4 x i32> %result
950}
951
; Kernel test: %out comes first in the argument list here; the kernel calls
; llvm.amdgcn.mfma.i32.16x16x64.i8 with all immediates zero and stores the
; result to %out with a plain (non-volatile) store.
; NOTE(review): the name says "no_agpr", yet the expected output for both
; selectors currently holds the accumulator/result in a[0:3] - confirm whether
; that is the intended baseline.
; NOTE(review): attribute group #0 is defined outside this part of the file.
952define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspace(1) %out, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) #0 {
953; SDAG-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
954; SDAG:       ; %bb.0:
955; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
956; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
957; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
958; SDAG-NEXT:    v_mov_b32_e32 v8, 0
959; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
960; SDAG-NEXT:    v_mov_b32_e32 v0, s8
961; SDAG-NEXT:    v_mov_b32_e32 v1, s9
962; SDAG-NEXT:    v_mov_b32_e32 v2, s10
963; SDAG-NEXT:    v_mov_b32_e32 v3, s11
964; SDAG-NEXT:    v_mov_b32_e32 v4, s12
965; SDAG-NEXT:    v_mov_b32_e32 v5, s13
966; SDAG-NEXT:    v_mov_b32_e32 v6, s14
967; SDAG-NEXT:    v_mov_b32_e32 v7, s15
968; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
969; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
970; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
971; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
972; SDAG-NEXT:    s_nop 1
973; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
974; SDAG-NEXT:    s_nop 6
975; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
976; SDAG-NEXT:    s_endpgm
977;
978; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
979; GISEL:       ; %bb.0:
980; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
981; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
982; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
983; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
984; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
985; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
986; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
987; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
988; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
989; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
990; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
991; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
992; GISEL-NEXT:    s_nop 1
993; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
994; GISEL-NEXT:    v_mov_b32_e32 v0, 0
995; GISEL-NEXT:    s_nop 5
996; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
997; GISEL-NEXT:    s_endpgm
998  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
999  store <4 x i32> %result, ptr addrspace(1) %out
1000  ret void
1001}
1002
; Kernel test: like test_mfma_i32_16x16x64_i8_no_agpr__vgprcd, but the three
; trailing immediates are 3, 2, 1; the expected MFMA prints them as
; cbsz:3 abid:2 blgp:1. The result is stored to %out with a plain store.
; NOTE(review): the name says "no_agpr", yet the expected output for both
; selectors currently holds the accumulator/result in a[0:3] - confirm whether
; that is the intended baseline.
; NOTE(review): attribute group #0 is defined outside this part of the file.
1003define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) #0 {
1004; SDAG-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
1005; SDAG:       ; %bb.0:
1006; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1007; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1008; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
1009; SDAG-NEXT:    v_mov_b32_e32 v8, 0
1010; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1011; SDAG-NEXT:    v_mov_b32_e32 v0, s8
1012; SDAG-NEXT:    v_mov_b32_e32 v1, s9
1013; SDAG-NEXT:    v_mov_b32_e32 v2, s10
1014; SDAG-NEXT:    v_mov_b32_e32 v3, s11
1015; SDAG-NEXT:    v_mov_b32_e32 v4, s12
1016; SDAG-NEXT:    v_mov_b32_e32 v5, s13
1017; SDAG-NEXT:    v_mov_b32_e32 v6, s14
1018; SDAG-NEXT:    v_mov_b32_e32 v7, s15
1019; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
1020; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
1021; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
1022; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
1023; SDAG-NEXT:    s_nop 1
1024; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
1025; SDAG-NEXT:    s_nop 6
1026; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
1027; SDAG-NEXT:    s_endpgm
1028;
1029; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
1030; GISEL:       ; %bb.0:
1031; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1032; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
1033; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1034; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1035; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
1036; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
1037; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
1038; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
1039; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
1040; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
1041; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
1042; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
1043; GISEL-NEXT:    s_nop 1
1044; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
1045; GISEL-NEXT:    v_mov_b32_e32 v0, 0
1046; GISEL-NEXT:    s_nop 5
1047; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
1048; GISEL-NEXT:    s_endpgm
1049  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1)
1050  store <4 x i32> %result, ptr addrspace(1) %out
1051  ret void
1052}
1053
1054; --------------------------------------------------------------------
1055; llvm.amdgcn.mfma.i32.32x32x32.i8
1056; --------------------------------------------------------------------
1057
; Operands as used by the tests below: two <4 x i32> inputs, a <16 x i32>
; accumulator, then three immediates that the expected assembly prints, in
; order, as cbsz, abid and blgp. Returns <16 x i32>.
1058declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x i32>, i32 immarg, i32 immarg, i32 immarg)
1059
; Kernel test: calls llvm.amdgcn.mfma.i32.32x32x32.i8 with all immediates zero,
; then volatile-stores the result and afterwards %arg2 to the null
; addrspace(1) pointer. The kernel has no output-pointer argument; in the
; expected output the store addresses are materialized as the constants
; 0, 16, 32 and 48.
; NOTE(review): attribute group #1 is defined outside this part of the file.
1060define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 {
1061; SDAG-LABEL: test_mfma_i32_32x32x32_i8:
1062; SDAG:       ; %bb.0:
1063; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1064; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1065; SDAG-NEXT:    v_mov_b64_e32 v[8:9], 48
1066; SDAG-NEXT:    v_mov_b64_e32 v[10:11], 32
1067; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 16
1068; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1069; SDAG-NEXT:    v_mov_b32_e32 v0, s24
1070; SDAG-NEXT:    v_mov_b32_e32 v1, s25
1071; SDAG-NEXT:    v_mov_b32_e32 v2, s26
1072; SDAG-NEXT:    v_mov_b32_e32 v3, s27
1073; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
1074; SDAG-NEXT:    v_mov_b32_e32 v4, s28
1075; SDAG-NEXT:    v_mov_b32_e32 v5, s29
1076; SDAG-NEXT:    v_mov_b32_e32 v6, s30
1077; SDAG-NEXT:    v_mov_b32_e32 v7, s31
1078; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
1079; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
1080; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
1081; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
1082; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
1083; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
1084; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
1085; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
1086; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
1087; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
1088; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
1089; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
1090; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
1091; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
1092; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
1093; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 0
1094; SDAG-NEXT:    s_nop 0
1095; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
1096; SDAG-NEXT:    v_mov_b32_e32 v0, s16
1097; SDAG-NEXT:    v_mov_b32_e32 v1, s17
1098; SDAG-NEXT:    v_mov_b32_e32 v2, s18
1099; SDAG-NEXT:    v_mov_b32_e32 v3, s19
1100; SDAG-NEXT:    s_nop 6
1101; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
1102; SDAG-NEXT:    s_waitcnt vmcnt(0)
1103; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
1104; SDAG-NEXT:    s_waitcnt vmcnt(0)
1105; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
1106; SDAG-NEXT:    s_waitcnt vmcnt(0)
1107; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
1108; SDAG-NEXT:    s_waitcnt vmcnt(0)
1109; SDAG-NEXT:    global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
1110; SDAG-NEXT:    s_waitcnt vmcnt(0)
1111; SDAG-NEXT:    s_nop 0
1112; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1113; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1114; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1115; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1116; SDAG-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
1117; SDAG-NEXT:    s_waitcnt vmcnt(0)
1118; SDAG-NEXT:    s_nop 0
1119; SDAG-NEXT:    v_mov_b32_e32 v0, s8
1120; SDAG-NEXT:    v_mov_b32_e32 v1, s9
1121; SDAG-NEXT:    v_mov_b32_e32 v2, s10
1122; SDAG-NEXT:    v_mov_b32_e32 v3, s11
1123; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
1124; SDAG-NEXT:    s_waitcnt vmcnt(0)
1125; SDAG-NEXT:    s_nop 0
1126; SDAG-NEXT:    v_mov_b32_e32 v0, s12
1127; SDAG-NEXT:    v_mov_b32_e32 v1, s13
1128; SDAG-NEXT:    v_mov_b32_e32 v2, s14
1129; SDAG-NEXT:    v_mov_b32_e32 v3, s15
1130; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
1131; SDAG-NEXT:    s_waitcnt vmcnt(0)
1132; SDAG-NEXT:    s_endpgm
1133;
1134; GISEL-LABEL: test_mfma_i32_32x32x32_i8:
1135; GISEL:       ; %bb.0:
1136; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1137; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1138; GISEL-NEXT:    v_mov_b64_e32 v[20:21], 0
1139; GISEL-NEXT:    v_mov_b64_e32 v[26:27], 48
1140; GISEL-NEXT:    v_mov_b64_e32 v[22:23], 16
1141; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1142; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
1143; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
1144; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
1145; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
1146; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
1147; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
1148; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
1149; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
1150; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
1151; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
1152; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
1153; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
1154; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
1155; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
1156; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
1157; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
1158; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
1159; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
1160; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
1161; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
1162; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
1163; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
1164; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
1165; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
1166; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
1167; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
1168; GISEL-NEXT:    v_mov_b64_e32 v[24:25], 32
1169; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
1170; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
1171; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
1172; GISEL-NEXT:    s_nop 3
1173; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
1174; GISEL-NEXT:    s_waitcnt vmcnt(0)
1175; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
1176; GISEL-NEXT:    s_waitcnt vmcnt(0)
1177; GISEL-NEXT:    global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
1178; GISEL-NEXT:    s_waitcnt vmcnt(0)
1179; GISEL-NEXT:    global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
1180; GISEL-NEXT:    s_waitcnt vmcnt(0)
1181; GISEL-NEXT:    global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
1182; GISEL-NEXT:    s_waitcnt vmcnt(0)
1183; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
1184; GISEL-NEXT:    s_waitcnt vmcnt(0)
1185; GISEL-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
1186; GISEL-NEXT:    s_waitcnt vmcnt(0)
1187; GISEL-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
1188; GISEL-NEXT:    s_waitcnt vmcnt(0)
1189; GISEL-NEXT:    s_endpgm
1190  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
  ; The result is stored first, then %arg2, so %arg2 is still used after the
  ; call; the expected MFMA reads a[0:15] and writes a[16:31].
1191  store volatile <16 x i32> %result, ptr addrspace(1) null
1192  store volatile <16 x i32> %arg2, ptr addrspace(1) null
1193  ret void
1194}
1195
; Same as test_mfma_i32_32x32x32_i8 but with the three trailing immediates set
; to 2, 3, 1; the expected MFMA prints them as cbsz:2 abid:3 blgp:1. The
; result and then %arg2 are volatile-stored to the null addrspace(1) pointer.
; NOTE(review): attribute group #1 is defined outside this part of the file.
1196define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 {
1197; SDAG-LABEL: test_mfma_i32_32x32x32_i8__flags:
1198; SDAG:       ; %bb.0:
1199; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1200; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1201; SDAG-NEXT:    v_mov_b64_e32 v[8:9], 48
1202; SDAG-NEXT:    v_mov_b64_e32 v[10:11], 32
1203; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 16
1204; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1205; SDAG-NEXT:    v_mov_b32_e32 v0, s24
1206; SDAG-NEXT:    v_mov_b32_e32 v1, s25
1207; SDAG-NEXT:    v_mov_b32_e32 v2, s26
1208; SDAG-NEXT:    v_mov_b32_e32 v3, s27
1209; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
1210; SDAG-NEXT:    v_mov_b32_e32 v4, s28
1211; SDAG-NEXT:    v_mov_b32_e32 v5, s29
1212; SDAG-NEXT:    v_mov_b32_e32 v6, s30
1213; SDAG-NEXT:    v_mov_b32_e32 v7, s31
1214; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
1215; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
1216; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
1217; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
1218; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
1219; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
1220; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
1221; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
1222; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
1223; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
1224; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
1225; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
1226; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
1227; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
1228; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
1229; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 0
1230; SDAG-NEXT:    s_nop 0
1231; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
1232; SDAG-NEXT:    v_mov_b32_e32 v0, s16
1233; SDAG-NEXT:    v_mov_b32_e32 v1, s17
1234; SDAG-NEXT:    v_mov_b32_e32 v2, s18
1235; SDAG-NEXT:    v_mov_b32_e32 v3, s19
1236; SDAG-NEXT:    s_nop 6
1237; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
1238; SDAG-NEXT:    s_waitcnt vmcnt(0)
1239; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
1240; SDAG-NEXT:    s_waitcnt vmcnt(0)
1241; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
1242; SDAG-NEXT:    s_waitcnt vmcnt(0)
1243; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
1244; SDAG-NEXT:    s_waitcnt vmcnt(0)
1245; SDAG-NEXT:    global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
1246; SDAG-NEXT:    s_waitcnt vmcnt(0)
1247; SDAG-NEXT:    s_nop 0
1248; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1249; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1250; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1251; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1252; SDAG-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
1253; SDAG-NEXT:    s_waitcnt vmcnt(0)
1254; SDAG-NEXT:    s_nop 0
1255; SDAG-NEXT:    v_mov_b32_e32 v0, s8
1256; SDAG-NEXT:    v_mov_b32_e32 v1, s9
1257; SDAG-NEXT:    v_mov_b32_e32 v2, s10
1258; SDAG-NEXT:    v_mov_b32_e32 v3, s11
1259; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
1260; SDAG-NEXT:    s_waitcnt vmcnt(0)
1261; SDAG-NEXT:    s_nop 0
1262; SDAG-NEXT:    v_mov_b32_e32 v0, s12
1263; SDAG-NEXT:    v_mov_b32_e32 v1, s13
1264; SDAG-NEXT:    v_mov_b32_e32 v2, s14
1265; SDAG-NEXT:    v_mov_b32_e32 v3, s15
1266; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
1267; SDAG-NEXT:    s_waitcnt vmcnt(0)
1268; SDAG-NEXT:    s_endpgm
1269;
1270; GISEL-LABEL: test_mfma_i32_32x32x32_i8__flags:
1271; GISEL:       ; %bb.0:
1272; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1273; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1274; GISEL-NEXT:    v_mov_b64_e32 v[20:21], 0
1275; GISEL-NEXT:    v_mov_b64_e32 v[26:27], 48
1276; GISEL-NEXT:    v_mov_b64_e32 v[22:23], 16
1277; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1278; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
1279; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
1280; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
1281; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
1282; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
1283; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
1284; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
1285; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
1286; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
1287; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
1288; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
1289; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
1290; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
1291; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
1292; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
1293; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
1294; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
1295; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
1296; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
1297; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
1298; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
1299; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
1300; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
1301; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
1302; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
1303; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
1304; GISEL-NEXT:    v_mov_b64_e32 v[24:25], 32
1305; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
1306; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
1307; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
1308; GISEL-NEXT:    s_nop 3
1309; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
1310; GISEL-NEXT:    s_waitcnt vmcnt(0)
1311; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
1312; GISEL-NEXT:    s_waitcnt vmcnt(0)
1313; GISEL-NEXT:    global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
1314; GISEL-NEXT:    s_waitcnt vmcnt(0)
1315; GISEL-NEXT:    global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
1316; GISEL-NEXT:    s_waitcnt vmcnt(0)
1317; GISEL-NEXT:    global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
1318; GISEL-NEXT:    s_waitcnt vmcnt(0)
1319; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
1320; GISEL-NEXT:    s_waitcnt vmcnt(0)
1321; GISEL-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
1322; GISEL-NEXT:    s_waitcnt vmcnt(0)
1323; GISEL-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
1324; GISEL-NEXT:    s_waitcnt vmcnt(0)
1325; GISEL-NEXT:    s_endpgm
1326  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 2, i32 3, i32 1)
  ; The result is stored first, then %arg2, so %arg2 is still used after the
  ; call; the expected MFMA reads a[0:15] and writes a[16:31].
1327  store volatile <16 x i32> %result, ptr addrspace(1) null
1328  store volatile <16 x i32> %arg2, ptr addrspace(1) null
1329  ret void
1330}
1331
; Callable (non-kernel) variant: returns the <16 x i32> result of
; llvm.amdgcn.mfma.i32.32x32x32.i8 with all three immediate operands set to 0.
; Both llc invocations share the GCN prefix for this function; the expected
; MFMA reads and writes the same accumulator range a[0:15].
1332define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
1333; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac:
1334; GCN:       ; %bb.0:
1335; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
1337; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
1338; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
1339; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
1340; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
1341; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
1342; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
1343; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
1344; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
1345; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
1346; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
1347; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
1348; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
1349; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
1350; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
1351; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
1352; GCN-NEXT:    s_nop 1
1353; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
1354; GCN-NEXT:    s_nop 7
1355; GCN-NEXT:    s_nop 2
1356; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
1357; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
1358; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
1359; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
1360; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
1361; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
1362; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
1363; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
1364; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
1365; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
1366; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
1367; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
1368; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
1369; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
1370; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
1371; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
1372; GCN-NEXT:    s_setpc_b64 s[30:31]
1373  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
1374  ret <16 x i32> %result
1375}
1376
; Same callable shape as the __mac test, but the three trailing immediates of
; the intrinsic call are all 1. The checks expect them to appear on the
; instruction as the modifiers cbsz, abid and blgp (each with value 1), with
; the MFMA still accumulating in place in a[0:15].
1377define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
1378; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
1379; GCN:       ; %bb.0:
1380; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1381; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
1382; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
1383; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
1384; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
1385; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
1386; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
1387; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
1388; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
1389; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
1390; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
1391; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
1392; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
1393; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
1394; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
1395; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
1396; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
1397; GCN-NEXT:    s_nop 1
1398; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
1399; GCN-NEXT:    s_nop 7
1400; GCN-NEXT:    s_nop 2
1401; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
1402; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
1403; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
1404; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
1405; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
1406; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
1407; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
1408; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
1409; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
1410; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
1411; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
1412; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
1413; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
1414; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
1415; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
1416; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
1417; GCN-NEXT:    s_setpc_b64 s[30:31]
1418  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 1, i32 1)
1419  ret <16 x i32> %result
1420}
1421
; Kernel variant in which the accumulator input stays live after the MFMA:
; %arg2 and then %result are each written to %out with a volatile store.
; In the expected code the MFMA uses distinct AGPR ranges for its result and
; its accumulator input (SelectionDAG output: a[0:15] from a[16:31];
; GlobalISel output: a[16:31] from a[0:15]). All three immediates are 0.
; The two instruction selectors are checked under separate prefixes.
1422define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
1423; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
1424; SDAG:       ; %bb.0:
1425; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
1426; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1427; SDAG-NEXT:    v_mov_b32_e32 v8, 0
1428; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1429; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1430; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1431; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1432; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1433; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1434; SDAG-NEXT:    v_mov_b32_e32 v4, s24
1435; SDAG-NEXT:    v_mov_b32_e32 v5, s25
1436; SDAG-NEXT:    v_mov_b32_e32 v6, s26
1437; SDAG-NEXT:    v_mov_b32_e32 v7, s27
1438; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1439; SDAG-NEXT:    v_accvgpr_write_b32 a31, s23
1440; SDAG-NEXT:    v_accvgpr_write_b32 a30, s22
1441; SDAG-NEXT:    v_accvgpr_write_b32 a29, s21
1442; SDAG-NEXT:    v_accvgpr_write_b32 a28, s20
1443; SDAG-NEXT:    v_accvgpr_write_b32 a27, s19
1444; SDAG-NEXT:    v_accvgpr_write_b32 a26, s18
1445; SDAG-NEXT:    v_accvgpr_write_b32 a25, s17
1446; SDAG-NEXT:    v_accvgpr_write_b32 a24, s16
1447; SDAG-NEXT:    v_accvgpr_write_b32 a23, s15
1448; SDAG-NEXT:    v_accvgpr_write_b32 a22, s14
1449; SDAG-NEXT:    v_accvgpr_write_b32 a21, s13
1450; SDAG-NEXT:    v_accvgpr_write_b32 a20, s12
1451; SDAG-NEXT:    v_accvgpr_write_b32 a19, s11
1452; SDAG-NEXT:    v_accvgpr_write_b32 a18, s10
1453; SDAG-NEXT:    v_accvgpr_write_b32 a17, s9
1454; SDAG-NEXT:    v_accvgpr_write_b32 a16, s8
1455; SDAG-NEXT:    s_nop 1
1456; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
1457; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1458; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1459; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1460; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1461; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
1462; SDAG-NEXT:    s_waitcnt vmcnt(0)
1463; SDAG-NEXT:    s_nop 0
1464; SDAG-NEXT:    v_mov_b32_e32 v0, s16
1465; SDAG-NEXT:    v_mov_b32_e32 v1, s17
1466; SDAG-NEXT:    v_mov_b32_e32 v2, s18
1467; SDAG-NEXT:    v_mov_b32_e32 v3, s19
1468; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
1469; SDAG-NEXT:    s_waitcnt vmcnt(0)
1470; SDAG-NEXT:    s_nop 0
1471; SDAG-NEXT:    v_mov_b32_e32 v0, s12
1472; SDAG-NEXT:    v_mov_b32_e32 v1, s13
1473; SDAG-NEXT:    v_mov_b32_e32 v2, s14
1474; SDAG-NEXT:    v_mov_b32_e32 v3, s15
1475; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
1476; SDAG-NEXT:    s_waitcnt vmcnt(0)
1477; SDAG-NEXT:    s_nop 0
1478; SDAG-NEXT:    v_mov_b32_e32 v0, s8
1479; SDAG-NEXT:    v_mov_b32_e32 v1, s9
1480; SDAG-NEXT:    v_mov_b32_e32 v2, s10
1481; SDAG-NEXT:    v_mov_b32_e32 v3, s11
1482; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
1483; SDAG-NEXT:    s_waitcnt vmcnt(0)
1484; SDAG-NEXT:    global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
1485; SDAG-NEXT:    s_waitcnt vmcnt(0)
1486; SDAG-NEXT:    global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
1487; SDAG-NEXT:    s_waitcnt vmcnt(0)
1488; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
1489; SDAG-NEXT:    s_waitcnt vmcnt(0)
1490; SDAG-NEXT:    global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
1491; SDAG-NEXT:    s_waitcnt vmcnt(0)
1492; SDAG-NEXT:    s_endpgm
1493;
1494; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
1495; GISEL:       ; %bb.0:
1496; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1497; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1498; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1499; GISEL-NEXT:    v_mov_b32_e32 v24, 0
1500; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1501; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
1502; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
1503; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
1504; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
1505; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
1506; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
1507; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
1508; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
1509; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
1510; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
1511; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
1512; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
1513; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
1514; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
1515; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
1516; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
1517; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
1518; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
1519; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
1520; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
1521; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
1522; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
1523; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
1524; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
1525; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
1526; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
1527; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
1528; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
1529; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
1530; GISEL-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
1531; GISEL-NEXT:    s_waitcnt vmcnt(0)
1532; GISEL-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
1533; GISEL-NEXT:    s_waitcnt vmcnt(0)
1534; GISEL-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
1535; GISEL-NEXT:    s_waitcnt vmcnt(0)
1536; GISEL-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
1537; GISEL-NEXT:    s_waitcnt vmcnt(0)
1538; GISEL-NEXT:    global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
1539; GISEL-NEXT:    s_waitcnt vmcnt(0)
1540; GISEL-NEXT:    global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
1541; GISEL-NEXT:    s_waitcnt vmcnt(0)
1542; GISEL-NEXT:    global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
1543; GISEL-NEXT:    s_waitcnt vmcnt(0)
1544; GISEL-NEXT:    global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
1545; GISEL-NEXT:    s_waitcnt vmcnt(0)
1546; GISEL-NEXT:    s_endpgm
1547  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
1548  store volatile <16 x i32> %arg2, ptr addrspace(1) %out
1549  store volatile <16 x i32> %result, ptr addrspace(1) %out
1550  ret void
1551}
1552
; Flags counterpart of the __vgprcd kernel: same volatile stores of %arg2 and
; %result to %out, but the intrinsic immediates are (1, 2, 3). The checks
; expect them, in that order, as the cbsz, abid and blgp modifiers on the MFMA.
; As in the non-flags kernel, the expected MFMA result and accumulator input
; occupy different AGPR ranges, and each selector has its own check prefix.
1553define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
1554; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
1555; SDAG:       ; %bb.0:
1556; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
1557; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1558; SDAG-NEXT:    v_mov_b32_e32 v8, 0
1559; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1560; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1561; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1562; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1563; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1564; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1565; SDAG-NEXT:    v_mov_b32_e32 v4, s24
1566; SDAG-NEXT:    v_mov_b32_e32 v5, s25
1567; SDAG-NEXT:    v_mov_b32_e32 v6, s26
1568; SDAG-NEXT:    v_mov_b32_e32 v7, s27
1569; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1570; SDAG-NEXT:    v_accvgpr_write_b32 a31, s23
1571; SDAG-NEXT:    v_accvgpr_write_b32 a30, s22
1572; SDAG-NEXT:    v_accvgpr_write_b32 a29, s21
1573; SDAG-NEXT:    v_accvgpr_write_b32 a28, s20
1574; SDAG-NEXT:    v_accvgpr_write_b32 a27, s19
1575; SDAG-NEXT:    v_accvgpr_write_b32 a26, s18
1576; SDAG-NEXT:    v_accvgpr_write_b32 a25, s17
1577; SDAG-NEXT:    v_accvgpr_write_b32 a24, s16
1578; SDAG-NEXT:    v_accvgpr_write_b32 a23, s15
1579; SDAG-NEXT:    v_accvgpr_write_b32 a22, s14
1580; SDAG-NEXT:    v_accvgpr_write_b32 a21, s13
1581; SDAG-NEXT:    v_accvgpr_write_b32 a20, s12
1582; SDAG-NEXT:    v_accvgpr_write_b32 a19, s11
1583; SDAG-NEXT:    v_accvgpr_write_b32 a18, s10
1584; SDAG-NEXT:    v_accvgpr_write_b32 a17, s9
1585; SDAG-NEXT:    v_accvgpr_write_b32 a16, s8
1586; SDAG-NEXT:    s_nop 1
1587; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
1588; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1589; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1590; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1591; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1592; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
1593; SDAG-NEXT:    s_waitcnt vmcnt(0)
1594; SDAG-NEXT:    s_nop 0
1595; SDAG-NEXT:    v_mov_b32_e32 v0, s16
1596; SDAG-NEXT:    v_mov_b32_e32 v1, s17
1597; SDAG-NEXT:    v_mov_b32_e32 v2, s18
1598; SDAG-NEXT:    v_mov_b32_e32 v3, s19
1599; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
1600; SDAG-NEXT:    s_waitcnt vmcnt(0)
1601; SDAG-NEXT:    s_nop 0
1602; SDAG-NEXT:    v_mov_b32_e32 v0, s12
1603; SDAG-NEXT:    v_mov_b32_e32 v1, s13
1604; SDAG-NEXT:    v_mov_b32_e32 v2, s14
1605; SDAG-NEXT:    v_mov_b32_e32 v3, s15
1606; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
1607; SDAG-NEXT:    s_waitcnt vmcnt(0)
1608; SDAG-NEXT:    s_nop 0
1609; SDAG-NEXT:    v_mov_b32_e32 v0, s8
1610; SDAG-NEXT:    v_mov_b32_e32 v1, s9
1611; SDAG-NEXT:    v_mov_b32_e32 v2, s10
1612; SDAG-NEXT:    v_mov_b32_e32 v3, s11
1613; SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
1614; SDAG-NEXT:    s_waitcnt vmcnt(0)
1615; SDAG-NEXT:    global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
1616; SDAG-NEXT:    s_waitcnt vmcnt(0)
1617; SDAG-NEXT:    global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
1618; SDAG-NEXT:    s_waitcnt vmcnt(0)
1619; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
1620; SDAG-NEXT:    s_waitcnt vmcnt(0)
1621; SDAG-NEXT:    global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
1622; SDAG-NEXT:    s_waitcnt vmcnt(0)
1623; SDAG-NEXT:    s_endpgm
1624;
1625; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
1626; GISEL:       ; %bb.0:
1627; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1628; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1629; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1630; GISEL-NEXT:    v_mov_b32_e32 v24, 0
1631; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1632; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
1633; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
1634; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
1635; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
1636; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
1637; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
1638; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
1639; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
1640; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
1641; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
1642; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
1643; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
1644; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
1645; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
1646; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
1647; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
1648; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
1649; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
1650; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
1651; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
1652; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
1653; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
1654; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
1655; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
1656; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
1657; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
1658; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
1659; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
1660; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
1661; GISEL-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
1662; GISEL-NEXT:    s_waitcnt vmcnt(0)
1663; GISEL-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
1664; GISEL-NEXT:    s_waitcnt vmcnt(0)
1665; GISEL-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
1666; GISEL-NEXT:    s_waitcnt vmcnt(0)
1667; GISEL-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
1668; GISEL-NEXT:    s_waitcnt vmcnt(0)
1669; GISEL-NEXT:    global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
1670; GISEL-NEXT:    s_waitcnt vmcnt(0)
1671; GISEL-NEXT:    global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
1672; GISEL-NEXT:    s_waitcnt vmcnt(0)
1673; GISEL-NEXT:    global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
1674; GISEL-NEXT:    s_waitcnt vmcnt(0)
1675; GISEL-NEXT:    global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
1676; GISEL-NEXT:    s_waitcnt vmcnt(0)
1677; GISEL-NEXT:    s_endpgm
1678  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 2, i32 3)
1679  store volatile <16 x i32> %arg2, ptr addrspace(1) %out
1680  store volatile <16 x i32> %result, ptr addrspace(1) %out
1681  ret void
1682}
1683
; Kernel variant where only %result is stored (non-volatile) to %out, so the
; accumulator input %arg2 has no use after the MFMA. Both selectors are
; expected to emit the MFMA in place (a[0:15] as result and accumulator).
; The per-selector checks differ in the scalar registers used for the loads
; and in the order of the four result stores. All three immediates are 0.
1684define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
1685; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
1686; SDAG:       ; %bb.0:
1687; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
1688; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1689; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1690; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1691; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1692; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1693; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1694; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1695; SDAG-NEXT:    v_mov_b32_e32 v4, s24
1696; SDAG-NEXT:    v_mov_b32_e32 v5, s25
1697; SDAG-NEXT:    v_mov_b32_e32 v6, s26
1698; SDAG-NEXT:    v_mov_b32_e32 v7, s27
1699; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1700; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
1701; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
1702; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
1703; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
1704; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
1705; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
1706; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
1707; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
1708; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
1709; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
1710; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
1711; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
1712; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
1713; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
1714; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
1715; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
1716; SDAG-NEXT:    s_nop 1
1717; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
1718; SDAG-NEXT:    v_mov_b32_e32 v0, 0
1719; SDAG-NEXT:    s_nop 7
1720; SDAG-NEXT:    s_nop 1
1721; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
1722; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
1723; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
1724; SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
1725; SDAG-NEXT:    s_endpgm
1726;
1727; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
1728; GISEL:       ; %bb.0:
1729; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1730; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1731; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1732; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1733; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
1734; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
1735; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
1736; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
1737; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
1738; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
1739; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
1740; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
1741; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
1742; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
1743; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
1744; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
1745; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
1746; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
1747; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
1748; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
1749; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
1750; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
1751; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
1752; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
1753; GISEL-NEXT:    s_nop 1
1754; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
1755; GISEL-NEXT:    v_mov_b32_e32 v0, 0
1756; GISEL-NEXT:    s_nop 7
1757; GISEL-NEXT:    s_nop 1
1758; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
1759; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
1760; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
1761; GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
1762; GISEL-NEXT:    s_endpgm
1763  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
1764  store <16 x i32> %result, ptr addrspace(1) %out
1765  ret void
1766}
1767
; Flags counterpart of the __vgprcd_mac kernel: only %result is stored to
; %out, and the intrinsic immediates are (3, 2, 1). The checks expect them, in
; that order, as the cbsz, abid and blgp modifiers on an in-place MFMA
; (a[0:15] as both result and accumulator) for both selectors.
1768define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
1769; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
1770; SDAG:       ; %bb.0:
1771; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
1772; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1773; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1774; SDAG-NEXT:    v_mov_b32_e32 v0, s20
1775; SDAG-NEXT:    v_mov_b32_e32 v1, s21
1776; SDAG-NEXT:    v_mov_b32_e32 v2, s22
1777; SDAG-NEXT:    v_mov_b32_e32 v3, s23
1778; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1779; SDAG-NEXT:    v_mov_b32_e32 v4, s24
1780; SDAG-NEXT:    v_mov_b32_e32 v5, s25
1781; SDAG-NEXT:    v_mov_b32_e32 v6, s26
1782; SDAG-NEXT:    v_mov_b32_e32 v7, s27
1783; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1784; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
1785; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
1786; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
1787; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
1788; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
1789; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
1790; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
1791; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
1792; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
1793; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
1794; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
1795; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
1796; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
1797; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
1798; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
1799; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
1800; SDAG-NEXT:    s_nop 1
1801; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
1802; SDAG-NEXT:    v_mov_b32_e32 v0, 0
1803; SDAG-NEXT:    s_nop 7
1804; SDAG-NEXT:    s_nop 1
1805; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
1806; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
1807; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
1808; SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
1809; SDAG-NEXT:    s_endpgm
1810;
1811; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
1812; GISEL:       ; %bb.0:
1813; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
1814; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
1815; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
1816; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1817; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
1818; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
1819; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
1820; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
1821; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
1822; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
1823; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
1824; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
1825; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
1826; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
1827; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
1828; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
1829; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
1830; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
1831; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
1832; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
1833; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
1834; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
1835; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
1836; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
1837; GISEL-NEXT:    s_nop 1
1838; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
1839; GISEL-NEXT:    v_mov_b32_e32 v0, 0
1840; GISEL-NEXT:    s_nop 7
1841; GISEL-NEXT:    s_nop 1
1842; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
1843; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
1844; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
1845; GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
1846; GISEL-NEXT:    s_endpgm
1847  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 3, i32 2, i32 1)
1848  store <16 x i32> %result, ptr addrspace(1) %out
1849  ret void
1850}
1851
1852; --------------------------------------------------------------------
1853; llvm.amdgcn.mfma.f32.16x16x32.bf16
1854; --------------------------------------------------------------------
1855
; Intrinsic under test in this section: two <8 x bfloat> source operands, a
; <4 x float> accumulator, and three i32 immarg operands. The tests that pass
; non-zero immediates expect them as the cbsz, abid and blgp instruction
; modifiers, in that order.
1856declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
1857
; Callable (non-kernel) variant: returns the <4 x float> result of
; llvm.amdgcn.mfma.f32.16x16x32.bf16 with all three immediates set to 0.
; Both llc invocations share the GCN prefix; the expected MFMA accumulates in
; place in a[0:3] and the result is read back into v0-v3.
1858define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
1859; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
1860; GCN:       ; %bb.0:
1861; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1862; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
1863; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
1864; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
1865; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
1866; GCN-NEXT:    s_nop 1
1867; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1868; GCN-NEXT:    s_nop 6
1869; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
1870; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
1871; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
1872; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
1873; GCN-NEXT:    s_setpc_b64 s[30:31]
1874  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
1875  ret <4 x float> %result
1876}
1877
; Same callable shape as test_mfma_f32_16x16x32_bf16, but the three intrinsic
; immediates are all 1. The checks expect the cbsz, abid and blgp modifiers
; (each with value 1) on the MFMA, which still accumulates in place in a[0:3].
1878define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
1879; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
1880; GCN:       ; %bb.0:
1881; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1882; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
1883; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
1884; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
1885; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
1886; GCN-NEXT:    s_nop 1
1887; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
1888; GCN-NEXT:    s_nop 6
1889; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
1890; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
1891; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
1892; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
1893; GCN-NEXT:    s_setpc_b64 s[30:31]
1894  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
1895  ret <4 x float> %result
1896}
1897
; Kernel variant: computes the bf16 MFMA with all immediates 0 and stores the
; <4 x float> result to %out (non-volatile). Both llc invocations share the
; GCN prefix.
; NOTE(review): despite "no_agpr" in the name, the expected code below still
; writes the accumulator into a0-a3 and stores from a[0:3]; the name
; presumably records an intended VGPR-only form - confirm before relying on it.
1898define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
1899; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
1900; GCN:       ; %bb.0:
1901; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1902; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
1903; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1904; GCN-NEXT:    v_mov_b32_e32 v8, 0
1905; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1906; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
1907; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
1908; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
1909; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
1910; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
1911; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
1912; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
1913; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
1914; GCN-NEXT:    s_nop 1
1915; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1916; GCN-NEXT:    s_nop 6
1917; GCN-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
1918; GCN-NEXT:    s_endpgm
1919  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
1920  store <4 x float> %result, ptr addrspace(1) %out
1921  ret void
1922}
1923
; Flags counterpart of the bf16 no_agpr__vgprcd kernel: the intrinsic
; immediates are (3, 2, 1), expected in that order as the cbsz, abid and blgp
; modifiers on the MFMA; the result is stored to %out (non-volatile).
; NOTE(review): despite "no_agpr" in the name, the expected code below still
; uses a0-a3 for the accumulator and the store - confirm the name's intent.
1924define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
1925; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
1926; GCN:       ; %bb.0:
1927; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1928; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
1929; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1930; GCN-NEXT:    v_mov_b32_e32 v8, 0
1931; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1932; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
1933; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
1934; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
1935; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
1936; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
1937; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
1938; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
1939; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
1940; GCN-NEXT:    s_nop 1
1941; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
1942; GCN-NEXT:    s_nop 6
1943; GCN-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
1944; GCN-NEXT:    s_endpgm
1945  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
1946  store <4 x float> %result, ptr addrspace(1) %out
1947  ret void
1948}
1949
; Attribute groups attached to the amdgpu_kernel tests. The string value of
; "amdgpu-flat-work-group-size" is a "min,max" pair (see the AMDGPU backend
; usage guide): #0 pins the flat work-group size to exactly 512, #1 allows
; 1 through 64.
; NOTE(review): #1 is not referenced by the kernels near the end of this file;
; presumably tests earlier in the file use it - verify before removing.
1950attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
1951attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
1952