xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
3
; Declarations of the MFMA intrinsics exercised by the kernels below.
; Each takes the A and B operands, the accumulator (whose type is also the
; result type), and three trailing i32 operands; the kernels in this file pass
; 1, 2, 3 there and expect cbsz:1 abid:2 blgp:3 on the selected instruction.
; llvm.amdgcn.workitem.id.x is declared but not called by any kernel visible
; in this file.
4declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
5declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
6declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
7declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
8declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
9declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
10declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
11declare i32 @llvm.amdgcn.workitem.id.x()
12
; Kernel test for the 32x32x4 bf16 "1k" MFMA intrinsic with a <32 x float>
; accumulator. The accumulator is loaded from %arg, A and B are the i64
; constants 1 and 2 bitcast to <4 x i16>, and the result is stored back to
; %arg. The trailing immediates 1, 2, 3 are expected to show up as
; cbsz:1 abid:2 blgp:3 on the selected v_mfma instruction.
; The autogenerated assertions expect the accumulator to be fetched with two
; s_load_dwordx16, copied into a[0:31] with v_accvgpr_write_b32, and the result
; to be stored directly from a[0:31] with eight global_store_dwordx4.
; The s_nop instructions between the MFMA and the first store are presumably
; hazard padding inserted by the backend.
13define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
14; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k:
15; GCN:       ; %bb.0: ; %bb
16; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
17; GCN-NEXT:    s_mov_b64 s[36:37], 1
18; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
19; GCN-NEXT:    s_mov_b32 s36, 2
20; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
21; GCN-NEXT:    s_waitcnt lgkmcnt(0)
22; GCN-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
23; GCN-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
24; GCN-NEXT:    s_waitcnt lgkmcnt(0)
25; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
26; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
27; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
28; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
29; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
30; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
31; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
32; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
33; GCN-NEXT:    v_accvgpr_write_b32 a8, s8
34; GCN-NEXT:    v_accvgpr_write_b32 a9, s9
35; GCN-NEXT:    v_accvgpr_write_b32 a10, s10
36; GCN-NEXT:    v_accvgpr_write_b32 a11, s11
37; GCN-NEXT:    v_accvgpr_write_b32 a12, s12
38; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
39; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
40; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
41; GCN-NEXT:    v_accvgpr_write_b32 a16, s16
42; GCN-NEXT:    v_accvgpr_write_b32 a17, s17
43; GCN-NEXT:    v_accvgpr_write_b32 a18, s18
44; GCN-NEXT:    v_accvgpr_write_b32 a19, s19
45; GCN-NEXT:    v_accvgpr_write_b32 a20, s20
46; GCN-NEXT:    v_accvgpr_write_b32 a21, s21
47; GCN-NEXT:    v_accvgpr_write_b32 a22, s22
48; GCN-NEXT:    v_accvgpr_write_b32 a23, s23
49; GCN-NEXT:    v_accvgpr_write_b32 a24, s24
50; GCN-NEXT:    v_accvgpr_write_b32 a25, s25
51; GCN-NEXT:    v_accvgpr_write_b32 a26, s26
52; GCN-NEXT:    v_accvgpr_write_b32 a27, s27
53; GCN-NEXT:    v_accvgpr_write_b32 a28, s28
54; GCN-NEXT:    v_accvgpr_write_b32 a29, s29
55; GCN-NEXT:    v_accvgpr_write_b32 a30, s30
56; GCN-NEXT:    v_accvgpr_write_b32 a31, s31
57; GCN-NEXT:    s_nop 1
58; GCN-NEXT:    v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
59; GCN-NEXT:    v_mov_b32_e32 v0, 0
60; GCN-NEXT:    s_nop 7
61; GCN-NEXT:    s_nop 7
62; GCN-NEXT:    s_nop 1
63; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[34:35]
64; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
65; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
66; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
67; GCN-NEXT:    global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
68; GCN-NEXT:    global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
69; GCN-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
70; GCN-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
71; GCN-NEXT:    s_endpgm
72bb:
73  %in.1 = load <32 x float>, ptr addrspace(1) %arg
74  %a = bitcast i64 1 to <4 x i16>
75  %b = bitcast i64 2 to <4 x i16>
76  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
77  store <32 x float> %mai.1, ptr addrspace(1) %arg
78  ret void
79}
80
; Kernel test for the 16x16x4 bf16 "1k" MFMA intrinsic with a <16 x float>
; accumulator. The accumulator is loaded from %arg, A and B are the i64
; constants 1 and 2 bitcast to <4 x i16>, and the result is stored back to
; %arg. The trailing immediates 1, 2, 3 are expected to show up as
; cbsz:1 abid:2 blgp:3 on the selected v_mfma instruction.
; The autogenerated assertions expect a single s_load_dwordx16 feeding sixteen
; v_accvgpr_write_b32 into a[0:15], and four global_store_dwordx4 reading the
; result straight from a[0:15].
81define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
82; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k:
83; GCN:       ; %bb.0: ; %bb
84; GCN-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
85; GCN-NEXT:    s_mov_b64 s[18:19], 1
86; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
87; GCN-NEXT:    s_mov_b32 s18, 2
88; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
89; GCN-NEXT:    s_waitcnt lgkmcnt(0)
90; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
91; GCN-NEXT:    s_waitcnt lgkmcnt(0)
92; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
93; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
94; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
95; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
96; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
97; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
98; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
99; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
100; GCN-NEXT:    v_accvgpr_write_b32 a8, s8
101; GCN-NEXT:    v_accvgpr_write_b32 a9, s9
102; GCN-NEXT:    v_accvgpr_write_b32 a10, s10
103; GCN-NEXT:    v_accvgpr_write_b32 a11, s11
104; GCN-NEXT:    v_accvgpr_write_b32 a12, s12
105; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
106; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
107; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
108; GCN-NEXT:    s_nop 1
109; GCN-NEXT:    v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
110; GCN-NEXT:    v_mov_b32_e32 v0, 0
111; GCN-NEXT:    s_nop 7
112; GCN-NEXT:    s_nop 1
113; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
114; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
115; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
116; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
117; GCN-NEXT:    s_endpgm
118bb:
119  %in.1 = load <16 x float>, ptr addrspace(1) %arg
120  %a = bitcast i64 1 to <4 x i16>
121  %b = bitcast i64 2 to <4 x i16>
122  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
123  store <16 x float> %mai.1, ptr addrspace(1) %arg
124  ret void
125}
126
; Kernel test for the 4x4x4 bf16 "1k" MFMA intrinsic with a <4 x float>
; accumulator. The accumulator is loaded from %arg, A and B are the i64
; constants 1 and 2 bitcast to <4 x i16>, and the result is stored back to
; %arg. The trailing immediates 1, 2, 3 are expected to show up as
; cbsz:1 abid:2 blgp:3 on the selected v_mfma instruction.
; The autogenerated assertions expect the accumulator to arrive through one
; s_load_dwordx4, be copied into a[0:3], and be written back with a single
; global_store_dwordx4 from a[0:3].
127define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
128; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k:
129; GCN:       ; %bb.0: ; %bb
130; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
131; GCN-NEXT:    s_mov_b64 s[4:5], 1
132; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
133; GCN-NEXT:    s_mov_b32 s4, 2
134; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
135; GCN-NEXT:    s_waitcnt lgkmcnt(0)
136; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
137; GCN-NEXT:    s_waitcnt lgkmcnt(0)
138; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
139; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
140; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
141; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
142; GCN-NEXT:    s_nop 1
143; GCN-NEXT:    v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
144; GCN-NEXT:    v_mov_b32_e32 v0, 0
145; GCN-NEXT:    s_nop 3
146; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
147; GCN-NEXT:    s_endpgm
148bb:
149  %in.1 = load <4 x float>, ptr addrspace(1) %arg
150  %a = bitcast i64 1 to <4 x i16>
151  %b = bitcast i64 2 to <4 x i16>
152  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
153  store <4 x float> %mai.1, ptr addrspace(1) %arg
154  ret void
155}
156
; Kernel test for the 32x32x8 bf16 "1k" MFMA intrinsic with a <16 x float>
; accumulator. The accumulator is loaded from %arg, A and B are the i64
; constants 1 and 2 bitcast to <4 x i16>, and the result is stored back to
; %arg. The trailing immediates 1, 2, 3 are expected to show up as
; cbsz:1 abid:2 blgp:3 on the selected v_mfma instruction.
; The autogenerated assertions expect a single s_load_dwordx16 feeding sixteen
; v_accvgpr_write_b32 into a[0:15], and four global_store_dwordx4 reading the
; result straight from a[0:15].
157define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
158; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k:
159; GCN:       ; %bb.0: ; %bb
160; GCN-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
161; GCN-NEXT:    s_mov_b64 s[18:19], 1
162; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
163; GCN-NEXT:    s_mov_b32 s18, 2
164; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
165; GCN-NEXT:    s_waitcnt lgkmcnt(0)
166; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
167; GCN-NEXT:    s_waitcnt lgkmcnt(0)
168; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
169; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
170; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
171; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
172; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
173; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
174; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
175; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
176; GCN-NEXT:    v_accvgpr_write_b32 a8, s8
177; GCN-NEXT:    v_accvgpr_write_b32 a9, s9
178; GCN-NEXT:    v_accvgpr_write_b32 a10, s10
179; GCN-NEXT:    v_accvgpr_write_b32 a11, s11
180; GCN-NEXT:    v_accvgpr_write_b32 a12, s12
181; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
182; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
183; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
184; GCN-NEXT:    s_nop 1
185; GCN-NEXT:    v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
186; GCN-NEXT:    v_mov_b32_e32 v0, 0
187; GCN-NEXT:    s_nop 7
188; GCN-NEXT:    s_nop 7
189; GCN-NEXT:    s_nop 1
190; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
191; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
192; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
193; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
194; GCN-NEXT:    s_endpgm
195bb:
196  %in.1 = load <16 x float>, ptr addrspace(1) %arg
197  %a = bitcast i64 1 to <4 x i16>
198  %b = bitcast i64 2 to <4 x i16>
199  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
200  store <16 x float> %mai.1, ptr addrspace(1) %arg
201  ret void
202}
203
; Kernel test for the 16x16x16 bf16 "1k" MFMA intrinsic with a <4 x float>
; accumulator. The accumulator is loaded from %arg, A and B are the i64
; constants 1 and 2 bitcast to <4 x i16>, and the result is stored back to
; %arg. The trailing immediates 1, 2, 3 are expected to show up as
; cbsz:1 abid:2 blgp:3 on the selected v_mfma instruction.
; The autogenerated assertions expect the accumulator to arrive through one
; s_load_dwordx4, be copied into a[0:3], and be written back with a single
; global_store_dwordx4 from a[0:3].
204define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
205; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k:
206; GCN:       ; %bb.0: ; %bb
207; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
208; GCN-NEXT:    s_mov_b64 s[4:5], 1
209; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
210; GCN-NEXT:    s_mov_b32 s4, 2
211; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
212; GCN-NEXT:    s_waitcnt lgkmcnt(0)
213; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
214; GCN-NEXT:    s_waitcnt lgkmcnt(0)
215; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
216; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
217; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
218; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
219; GCN-NEXT:    s_nop 1
220; GCN-NEXT:    v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
221; GCN-NEXT:    v_mov_b32_e32 v0, 0
222; GCN-NEXT:    s_nop 7
223; GCN-NEXT:    s_nop 1
224; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
225; GCN-NEXT:    s_endpgm
226bb:
227  %in.1 = load <4 x float>, ptr addrspace(1) %arg
228  %a = bitcast i64 1 to <4 x i16>
229  %b = bitcast i64 2 to <4 x i16>
230  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
231  store <4 x float> %mai.1, ptr addrspace(1) %arg
232  ret void
233}
234
; Kernel test for the scalar-accumulator f64 4x4x4 MFMA intrinsic, called
; twice in a chain. The first call uses the constant 0.0 as accumulator with
; all three trailing immediates zero; the second call feeds the first result
; back in as accumulator with immediates 1, 2, 3. %a and %b are kernel
; arguments, and the final double is stored to %arg.
; The autogenerated assertions expect the first v_mfma to take a literal 0 as
; its accumulator operand with no modifiers printed, and the second to
; accumulate in place in a[0:1] with cbsz:1 abid:2 blgp:3.
235define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
236; GCN-LABEL: test_mfma_f64_4x4x4f64:
237; GCN:       ; %bb.0: ; %bb
238; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
239; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
240; GCN-NEXT:    s_waitcnt lgkmcnt(0)
241; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
242; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
243; GCN-NEXT:    s_nop 1
244; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
245; GCN-NEXT:    s_nop 3
246; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
247; GCN-NEXT:    v_mov_b32_e32 v0, 0
248; GCN-NEXT:    s_nop 7
249; GCN-NEXT:    global_store_dwordx2 v0, a[0:1], s[0:1]
250; GCN-NEXT:    s_endpgm
251bb:
252  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
253  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
254  store double %mai.2, ptr addrspace(1) %arg
255  ret void
256}
257
; Kernel test for the f64 16x16x4 MFMA intrinsic with a <4 x double>
; accumulator loaded from %arg. %a and %b are kernel arguments, the trailing
; immediates are 1, 2, 3, and the result is stored back to %arg.
; The autogenerated assertions expect the accumulator to be fetched with
; s_load_dwordx8, copied into a[0:7] with v_accvgpr_write_b32, a single v_mfma
; carrying cbsz:1 abid:2 blgp:3, and two global_store_dwordx4 reading the
; result from a[0:7].
258define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
259; GCN-LABEL: test_mfma_f64_16x16x4f64:
260; GCN:       ; %bb.0: ; %bb
261; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
262; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
263; GCN-NEXT:    s_waitcnt lgkmcnt(0)
264; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
265; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
266; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
267; GCN-NEXT:    s_waitcnt lgkmcnt(0)
268; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
269; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
270; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
271; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
272; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
273; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
274; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
275; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
276; GCN-NEXT:    s_nop 1
277; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
278; GCN-NEXT:    v_mov_b32_e32 v0, 0
279; GCN-NEXT:    s_nop 7
280; GCN-NEXT:    s_nop 7
281; GCN-NEXT:    s_nop 0
282; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
283; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
284; GCN-NEXT:    s_endpgm
285bb:
286  %in.1 = load <4 x double>, ptr addrspace(1) %arg
287  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
288  store <4 x double> %mai.1, ptr addrspace(1) %arg
289  ret void
290}
291
; Kernel test for the f64 16x16x4 MFMA intrinsic where the accumulator of the
; first call is an all-zero <4 x double> constant. The first call uses zero
; trailing immediates; its result feeds a second call with immediates 1, 2, 3,
; whose result is stored to %arg.
; The autogenerated assertions expect the zero splat to be encoded as a
; literal 0 accumulator operand on the first v_mfma (no register
; materialization), and the second v_mfma to accumulate in place in a[0:7]
; with cbsz:1 abid:2 blgp:3.
292define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
293; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm:
294; GCN:       ; %bb.0: ; %bb
295; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
296; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
297; GCN-NEXT:    s_waitcnt lgkmcnt(0)
298; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
299; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
300; GCN-NEXT:    s_nop 1
301; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
302; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
303; GCN-NEXT:    v_mov_b32_e32 v0, 0
304; GCN-NEXT:    s_nop 7
305; GCN-NEXT:    s_nop 7
306; GCN-NEXT:    s_nop 0
307; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
308; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
309; GCN-NEXT:    s_endpgm
310bb:
311  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
312  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
313  store <4 x double> %mai.2, ptr addrspace(1) %arg
314  ret void
315}
316
; Kernel test for the f64 16x16x4 MFMA intrinsic with a constant accumulator
; that is not a splat: <0.0, 0.0, 0.0, 1.0>. All trailing immediates are zero
; and the result is stored to %arg.
; The autogenerated assertions expect the constant to be built in s[0:7]
; (s_mov_b64 of 0 and 1.0 plus copies), moved into a[0:7] with
; v_accvgpr_write_b32, and passed to the v_mfma as a register operand; no
; cbsz/abid/blgp modifiers are printed.
317define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
318; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
319; GCN:       ; %bb.0: ; %bb
320; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
321; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
322; GCN-NEXT:    s_mov_b64 s[0:1], 0
323; GCN-NEXT:    s_mov_b64 s[6:7], 1.0
324; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
325; GCN-NEXT:    s_waitcnt lgkmcnt(0)
326; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
327; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
328; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
329; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
330; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
331; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
332; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
333; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
334; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
335; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
336; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
337; GCN-NEXT:    s_nop 1
338; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
339; GCN-NEXT:    v_mov_b32_e32 v0, 0
340; GCN-NEXT:    s_nop 7
341; GCN-NEXT:    s_nop 7
342; GCN-NEXT:    s_nop 0
343; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
344; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
345; GCN-NEXT:    s_endpgm
346bb:
347  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
348  store <4 x double> %mai.1, ptr addrspace(1) %arg
349  ret void
350}
351
; Kernel test for the f64 16x16x4 MFMA intrinsic with a splat accumulator of
; 123.0 in every lane. All trailing immediates are zero and the result is
; stored to %arg.
; The autogenerated assertions expect the value to be materialized in
; registers rather than used as an operand literal: s0 = 0 and
; s1 = 0x405ec000 (the low and high 32 bits of the double 123.0), replicated
; across s[0:7], copied into a[0:7], and passed to the v_mfma as a register
; operand with no cbsz/abid/blgp modifiers printed.
352define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
353; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
354; GCN:       ; %bb.0: ; %bb
355; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
356; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
357; GCN-NEXT:    s_mov_b32 s0, 0
358; GCN-NEXT:    s_mov_b32 s1, 0x405ec000
359; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
360; GCN-NEXT:    s_waitcnt lgkmcnt(0)
361; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
362; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
363; GCN-NEXT:    s_mov_b64 s[6:7], s[0:1]
364; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
365; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
366; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
367; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
368; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
369; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
370; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
371; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
372; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
373; GCN-NEXT:    s_nop 1
374; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
375; GCN-NEXT:    v_mov_b32_e32 v0, 0
376; GCN-NEXT:    s_nop 7
377; GCN-NEXT:    s_nop 7
378; GCN-NEXT:    s_nop 0
379; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
380; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
381; GCN-NEXT:    s_endpgm
382bb:
383  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
384  store <4 x double> %mai.1, ptr addrspace(1) %arg
385  ret void
386}
387
; Attribute group #0 is attached to every kernel in this file. It sets the
; string attribute "amdgpu-flat-work-group-size" to "1,256" (presumably the
; minimum and maximum flat work-group size; confirm against the AMDGPU
; backend usage guide).
388attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
389