xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mfma-loop.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
2; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s
4
5; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
6
7; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
8
9; Check that we do not copy agprs to vgprs and back inside the loop.
10
11; GCN: [[LOOP:.LBB[0-9_]+]]:
12; GCN-NOT:  v_accvgpr
13; GFX908_A: v_mfma_f32_32x32x1f32
14; GFX940:   v_mfma_f32_32x32x1_2b_f32
15; GCN-NOT:  v_accvgpr
16; GCN:      s_cbranch_scc1 [[LOOP]]
17
18; Final result should be read only once after the loop.
19
20; GFX908-COUNT-32: v_accvgpr_read_b32
21; GFX90A-NOT:      v_accvgpr_read_b32
22; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
23; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
24
; Kernel with a single-block loop that chains 16 MFMA calls through one
; <32 x float> accumulator. The accumulator starts as all zeros and the final
; value is stored to %arg after the loop.
define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
entry:
  br label %loop

loop:
  ; Loop-carried accumulator: zero on entry, previous MFMA result afterwards.
  %acc = phi <32 x float> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
41
42; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
43
44; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
45; 3 vgprs are needed to avoid wait states between writes.
46; Check that we do not use 32 temp sgprs as well.
47
48; GCN:          v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
49; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
50
51; GCN: [[LOOP:.LBB[0-9_]+]]:
52; GCN-NOT:  v_accvgpr
53; GFX908_A: v_mfma_f32_32x32x1f32
54; GFX940:   v_mfma_f32_32x32x1_2b_f32
55; GCN-NOT:  v_accvgpr
56; GCN:      s_cbranch_scc1 [[LOOP]]
57
58; GFX908-COUNT-32: v_accvgpr_read_b32
59; GFX90A-NOT:      v_accvgpr_read_b32
60; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
61; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
62
; Same loop shape as the zero-init case, but the accumulator is seeded with a
; splat of 123.0 (0x42f60000 as an f32 bit pattern) in all 32 lanes.
define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 {
entry:
  br label %loop

loop:
  ; Loop-carried accumulator: 32 x 123.0 on entry, previous MFMA result afterwards.
  %acc = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0,
                             float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
79
80; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
81
82; GCN-COUNT-31:    v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
83; GCN:             v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
84
85; GCN: [[LOOP:.LBB[0-9_]+]]:
86; GCN-NOT:  v_accvgpr
87; GFX908_A: v_mfma_f32_32x32x1f32
88; GFX940:   v_mfma_f32_32x32x1_2b_f32
89; GCN-NOT:  v_accvgpr
90; GCN:      s_cbranch_scc1 [[LOOP]]
91
92; GFX908-COUNT-32: v_accvgpr_read_b32
93; GFX90A-NOT:      v_accvgpr_read_b32
94; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
95; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
96
; Same loop shape as the zero-init case, but the initial accumulator is not a
; splat: lane 1 holds 1.0 and the remaining 31 lanes hold 0.0.
define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
entry:
  br label %loop

loop:
  ; Loop-carried accumulator: the mostly-zero constant on entry, previous MFMA
  ; result afterwards.
  %acc = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0,
                             float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
113
114; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
115
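; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.
;
; NOTE(review): the checks that follow use the prefix `GFX-908`, which is not
; one of the prefixes enabled by -check-prefixes on the RUN lines, so FileCheck
; currently ignores them. Their first pattern also lacks the `a` register
; prefix before the number regex. Confirm whether they were meant to be
; enabled, and regenerate them against current llc output before doing so.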
118
119; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
120; GFX-908: v_mov_b32_e32 v0, 0x42f80000
121; GFX-908: s_nop 1
122; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
123; GFX-908: v_mov_b32_e32 v0, 0x42fa0000
124; GFX-908: s_nop 1
125; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
126; GFX-908: v_mov_b32_e32 v0, 0x42fc0000
127; GFX-908: s_nop 1
128; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
129; GFX-908: v_mov_b32_e32 v0, 0x42fe0000
130; GFX-908: s_nop 1
131; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
132; GFX-908: v_mov_b32_e32 v0, 0x43000000
133; GFX-908: s_nop 1
134; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
135; GFX-908: v_mov_b32_e32 v0, 0x43010000
136; GFX-908: s_nop 1
137; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
138; GFX-908: v_mov_b32_e32 v0, 0x43020000
139; GFX-908: s_nop 1
140; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
141; GFX-908: v_mov_b32_e32 v0, 0x43030000
142; GFX-908: s_nop 1
143; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
144; GFX-908: v_mov_b32_e32 v0, 0x43040000
145; GFX-908: s_nop 1
146; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
147; GFX-908: v_mov_b32_e32 v0, 0x43050000
148; GFX-908: s_nop 1
149; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
150; GFX-908: v_mov_b32_e32 v0, 0x43060000
151; GFX-908: s_nop 1
152; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
153; GFX-908: v_mov_b32_e32 v0, 0x43070000
154; GFX-908: s_nop 1
155; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
156; GFX-908: v_mov_b32_e32 v0, 0x43080000
157; GFX-908: s_nop 1
158; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
159; GFX-908: v_mov_b32_e32 v0, 0x43090000
160; GFX-908: s_nop 1
161; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
162; GFX-908: v_mov_b32_e32 v0, 0x430a0000
163; GFX-908: s_nop 1
164; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
165; GFX-908: v_mov_b32_e32 v0, 0x430b0000
166; GFX-908: s_nop 1
167; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
168; GFX-908: v_mov_b32_e32 v0, 0x430c0000
169; GFX-908: s_nop 1
170; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
171; GFX-908: v_mov_b32_e32 v0, 0x430d0000
172; GFX-908: s_nop 1
173; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
174; GFX-908: v_mov_b32_e32 v0, 0x430e0000
175; GFX-908: s_nop 1
176; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
177; GFX-908: v_mov_b32_e32 v0, 0x430f0000
178; GFX-908: s_nop 1
179; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
180; GFX-908: v_mov_b32_e32 v0, 0x43100000
181; GFX-908: s_nop 1
182; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
183; GFX-908: v_mov_b32_e32 v0, 0x43110000
184; GFX-908: s_nop 1
185; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
186; GFX-908: v_mov_b32_e32 v0, 0x43120000
187; GFX-908: s_nop 1
188; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
189; GFX-908: v_mov_b32_e32 v0, 0x43130000
190; GFX-908: s_nop 1
191; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
192; GFX-908: v_mov_b32_e32 v0, 0x43140000
193; GFX-908: s_nop 1
194; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
195; GFX-908: v_mov_b32_e32 v0, 0x43150000
196; GFX-908: s_nop 1
197; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
198; GFX-908: v_mov_b32_e32 v0, 0x43160000
199; GFX-908: s_nop 1
200; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
201; GFX-908: v_mov_b32_e32 v0, 0x43170000
202; GFX-908: s_nop 1
203; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
204; GFX-908: v_mov_b32_e32 v0, 0x43180000
205; GFX-908: s_nop 1
206; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
207; GFX-908: v_mov_b32_e32 v0, 0x43190000
208; GFX-908: s_nop 1
209; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
210; GFX-908: v_mov_b32_e32 v0, 0x431a0000
211; GFX-908: s_nop 1
212; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
213
214; FIXME: Constant is now in VGPR instead of SGPR.
215
216; GFX940_A: v_mov_b32_e32 v{{[0-9]+}}, 0x4{{[0-9a-f]+}}
217; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
218
219; GCN: [[LOOP:.LBB[0-9_]+]]:
220; GCN-NOT:  v_accvgpr
221; GFX908_A: v_mfma_f32_32x32x1f32
222; GFX940:   v_mfma_f32_32x32x1_2b_f32
223; GCN-NOT:  v_accvgpr
224; GCN:      s_cbranch_scc1 [[LOOP]]
225
226; GFX908-COUNT-32: v_accvgpr_read_b32
227; GFX90A-NOT:      v_accvgpr_read_b32
228; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
229; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
230
; Same loop shape as the zero-init case, but every lane of the initial
; accumulator holds a different constant: 123.0, 124.0, ..., 154.0.
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 {
entry:
  br label %loop

loop:
  ; Loop-carried accumulator: the ascending constant sequence on entry,
  ; previous MFMA result afterwards.
  %acc = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0,
                             float 127.0, float 128.0, float 129.0, float 130.0,
                             float 131.0, float 132.0, float 133.0, float 134.0,
                             float 135.0, float 136.0, float 137.0, float 138.0,
                             float 139.0, float 140.0, float 141.0, float 142.0,
                             float 143.0, float 144.0, float 145.0, float 146.0,
                             float 147.0, float 148.0, float 149.0, float 150.0,
                             float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
247
248; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
249
250; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
251
252; GCN: [[LOOP:.LBB[0-9_]+]]:
253; GCN-NOT:  v_accvgpr
254; GFX908_A: v_mfma_f32_32x32x1f32
255; GFX940:   v_mfma_f32_32x32x1_2b_f32
256; GCN-NOT:  v_accvgpr
257; GCN:      s_cbranch_scc1 [[LOOP]]
258
259; GFX908-COUNT-32: v_accvgpr_read_b32
260; GFX90A-NOT:      v_accvgpr_read_b32
261; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
262; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
263
; The initial accumulator is a 32-lane splat of a value derived from the
; workitem id (the id's bits reinterpreted as float). The loop then chains 16
; MFMA calls through that accumulator and stores the final value to %arg.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
entry:
  %lane.id = call i32 @llvm.amdgcn.workitem.id.x()
  %seed = bitcast i32 %lane.id to float
  ; Build the splat one lane at a time. The chain starts from poison rather
  ; than undef; every lane is overwritten below, so the base value is never
  ; observed.
  %splat0 = insertelement <32 x float> poison, float %seed, i32 0
  %splat1 = insertelement <32 x float> %splat0, float %seed, i32 1
  %splat2 = insertelement <32 x float> %splat1, float %seed, i32 2
  %splat3 = insertelement <32 x float> %splat2, float %seed, i32 3
  %splat4 = insertelement <32 x float> %splat3, float %seed, i32 4
  %splat5 = insertelement <32 x float> %splat4, float %seed, i32 5
  %splat6 = insertelement <32 x float> %splat5, float %seed, i32 6
  %splat7 = insertelement <32 x float> %splat6, float %seed, i32 7
  %splat8 = insertelement <32 x float> %splat7, float %seed, i32 8
  %splat9 = insertelement <32 x float> %splat8, float %seed, i32 9
  %splat10 = insertelement <32 x float> %splat9, float %seed, i32 10
  %splat11 = insertelement <32 x float> %splat10, float %seed, i32 11
  %splat12 = insertelement <32 x float> %splat11, float %seed, i32 12
  %splat13 = insertelement <32 x float> %splat12, float %seed, i32 13
  %splat14 = insertelement <32 x float> %splat13, float %seed, i32 14
  %splat15 = insertelement <32 x float> %splat14, float %seed, i32 15
  %splat16 = insertelement <32 x float> %splat15, float %seed, i32 16
  %splat17 = insertelement <32 x float> %splat16, float %seed, i32 17
  %splat18 = insertelement <32 x float> %splat17, float %seed, i32 18
  %splat19 = insertelement <32 x float> %splat18, float %seed, i32 19
  %splat20 = insertelement <32 x float> %splat19, float %seed, i32 20
  %splat21 = insertelement <32 x float> %splat20, float %seed, i32 21
  %splat22 = insertelement <32 x float> %splat21, float %seed, i32 22
  %splat23 = insertelement <32 x float> %splat22, float %seed, i32 23
  %splat24 = insertelement <32 x float> %splat23, float %seed, i32 24
  %splat25 = insertelement <32 x float> %splat24, float %seed, i32 25
  %splat26 = insertelement <32 x float> %splat25, float %seed, i32 26
  %splat27 = insertelement <32 x float> %splat26, float %seed, i32 27
  %splat28 = insertelement <32 x float> %splat27, float %seed, i32 28
  %splat29 = insertelement <32 x float> %splat28, float %seed, i32 29
  %splat30 = insertelement <32 x float> %splat29, float %seed, i32 30
  %splat31 = insertelement <32 x float> %splat30, float %seed, i32 31

  br label %loop

loop:
  ; Loop-carried accumulator: the splat on entry, previous MFMA result afterwards.
  %acc = phi <32 x float> [ %splat31, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
315
316; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
317
318; GFX908:          v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
319; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
320; GFX940_A-COUNT-32:        v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
321
322; GCN: [[LOOP:.LBB[0-9_]+]]:
323; GCN-NOT:  v_accvgpr
324; GFX908_A: v_mfma_f32_32x32x1f32
325; GFX940:   v_mfma_f32_32x32x1_2b_f32
326; GCN-NOT:  v_accvgpr
327; GCN:      s_cbranch_scc1 [[LOOP]]
328
329; GFX908-COUNT-32: v_accvgpr_read_b32
330; GFX90A-NOT:      v_accvgpr_read_b32
331; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
332; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
333
; The initial accumulator is a 32-lane splat of the kernel argument %init.
; The loop then chains 16 MFMA calls through that accumulator and stores the
; final value to %arg.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 {
entry:
  ; Build the splat one lane at a time. The chain starts from poison rather
  ; than undef; every lane is overwritten below, so the base value is never
  ; observed.
  %splat0 = insertelement <32 x float> poison, float %init, i32 0
  %splat1 = insertelement <32 x float> %splat0, float %init, i32 1
  %splat2 = insertelement <32 x float> %splat1, float %init, i32 2
  %splat3 = insertelement <32 x float> %splat2, float %init, i32 3
  %splat4 = insertelement <32 x float> %splat3, float %init, i32 4
  %splat5 = insertelement <32 x float> %splat4, float %init, i32 5
  %splat6 = insertelement <32 x float> %splat5, float %init, i32 6
  %splat7 = insertelement <32 x float> %splat6, float %init, i32 7
  %splat8 = insertelement <32 x float> %splat7, float %init, i32 8
  %splat9 = insertelement <32 x float> %splat8, float %init, i32 9
  %splat10 = insertelement <32 x float> %splat9, float %init, i32 10
  %splat11 = insertelement <32 x float> %splat10, float %init, i32 11
  %splat12 = insertelement <32 x float> %splat11, float %init, i32 12
  %splat13 = insertelement <32 x float> %splat12, float %init, i32 13
  %splat14 = insertelement <32 x float> %splat13, float %init, i32 14
  %splat15 = insertelement <32 x float> %splat14, float %init, i32 15
  %splat16 = insertelement <32 x float> %splat15, float %init, i32 16
  %splat17 = insertelement <32 x float> %splat16, float %init, i32 17
  %splat18 = insertelement <32 x float> %splat17, float %init, i32 18
  %splat19 = insertelement <32 x float> %splat18, float %init, i32 19
  %splat20 = insertelement <32 x float> %splat19, float %init, i32 20
  %splat21 = insertelement <32 x float> %splat20, float %init, i32 21
  %splat22 = insertelement <32 x float> %splat21, float %init, i32 22
  %splat23 = insertelement <32 x float> %splat22, float %init, i32 23
  %splat24 = insertelement <32 x float> %splat23, float %init, i32 24
  %splat25 = insertelement <32 x float> %splat24, float %init, i32 25
  %splat26 = insertelement <32 x float> %splat25, float %init, i32 26
  %splat27 = insertelement <32 x float> %splat26, float %init, i32 27
  %splat28 = insertelement <32 x float> %splat27, float %init, i32 28
  %splat29 = insertelement <32 x float> %splat28, float %init, i32 29
  %splat30 = insertelement <32 x float> %splat29, float %init, i32 30
  %splat31 = insertelement <32 x float> %splat30, float %init, i32 31

  br label %loop

loop:
  ; Loop-carried accumulator: the splat on entry, previous MFMA result afterwards.
  %acc = phi <32 x float> [ %splat31, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
383
384; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
385
386; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v0
387; GFX908-DAG:   v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
388; GFX940_A-DAG: s_load_dword [[TMP:s[0-9]+]],
389; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
390; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
391; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
392; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
393; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
394; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
395; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
396; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
397; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
398; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
399; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
400; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
401; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
402; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
403; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
404; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
405; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
406; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
407; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
408; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
409; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
410; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
411; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
412; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
413; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
414; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
415; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
416; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
417; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
418; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
419; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
420
421; GFX90A-DAG:      v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
422; GFX90A-COUNT-28: v_accvgpr_write_b32 a{{[0-9]+}}, 0
423
424; GCN: [[LOOP:.LBB[0-9_]+]]:
425; GCN-NOT:  v_accvgpr
426; GFX908_A: v_mfma_f32_32x32x1f32
427; GFX940:   v_mfma_f32_32x32x1_2b_f32
428; GCN-NOT:  v_accvgpr
429; GCN:      s_cbranch_scc1 [[LOOP]]
430
431; GFX908-COUNT-32: v_accvgpr_read_b32
432; GFX90A-NOT:      v_accvgpr_read_b32
433; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
434; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
435
; The initial accumulator mixes three kinds of lane values: lane 0 comes from
; the workitem id (bits reinterpreted as float), lane 1 is the kernel argument
; %x, and the remaining 30 lanes are zero.
define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 {
entry:
  %lane.id = call i32 @llvm.amdgcn.workitem.id.x()
  %seed = bitcast i32 %lane.id to float
  %with.lane0 = insertelement <32 x float> zeroinitializer, float %seed, i32 0
  %with.lane1 = insertelement <32 x float> %with.lane0, float %x, i32 1

  br label %loop

loop:
  ; Loop-carried accumulator: the mixed vector on entry, previous MFMA result
  ; afterwards.
  %acc = phi <32 x float> [ %with.lane1, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
457
458; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
459
460; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
461; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
462; GFX90A-NOT:      v_accvgpr
463; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
464; GFX90A-NOT:      v_accvgpr
465; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
466; GCN-NOT:         v_accvgpr
467
468; GCN: [[LOOP:.LBB[0-9_]+]]:
469; GCN-NOT:  v_accvgpr
470; GFX908_A: v_mfma_f32_32x32x1f32
471; GFX940:   v_mfma_f32_32x32x1_2b_f32
472; GCN-NOT:  v_accvgpr
473; GCN:      s_cbranch_scc1 [[LOOP]]
474
475; GFX908-COUNT-32: v_accvgpr_read_b32
476; GFX90A-NOT:      v_accvgpr_read_b32
477; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
478; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
479
; The initial accumulator is itself the result of an MFMA call issued before
; the loop (with a zero accumulator input), and is forwarded unchanged into
; the loop's first iteration.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 {
entry:
  %acc.init = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %loop

loop:
  ; Loop-carried accumulator: the pre-loop MFMA result on entry, previous
  ; in-loop MFMA result afterwards.
  %acc = phi <32 x float> [ %acc.init, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
498
499; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
500
501; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
502; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
503; GFX90A-NOT:      v_accvgpr
504; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
505; GFX90A-NOT:      v_accvgpr
506; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
507
508; Check that we are using only one tmp VGPR.
509
510; GFX908:             v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
511; GFX940_A-COUNT-31:  v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
512
513; GCN: [[LOOP:.LBB[0-9_]+]]:
514; GCN-NOT:  v_accvgpr
515; GFX908_A: v_mfma_f32_32x32x1f32
516; GFX940:   v_mfma_f32_32x32x1_2b_f32
517; GCN-NOT:  v_accvgpr
518; GCN:      s_cbranch_scc1 [[LOOP]]
519
520; GFX908-COUNT-32: v_accvgpr_read_b32
521; GFX90A-NOT:      v_accvgpr_read_b32
522; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
523; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
524
; The initial accumulator is a 32-lane splat of lane 0 of an MFMA result
; computed before the loop. The loop then chains 16 MFMA calls through that
; accumulator and stores the final value to %arg.
define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
entry:
  %acc.init = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %lane0 = extractelement <32 x float> %acc.init, i32 0
  ; Build the splat one lane at a time. The chain starts from poison rather
  ; than undef; every lane is overwritten below, so the base value is never
  ; observed.
  %splat0 = insertelement <32 x float> poison, float %lane0, i32 0
  %splat1 = insertelement <32 x float> %splat0, float %lane0, i32 1
  %splat2 = insertelement <32 x float> %splat1, float %lane0, i32 2
  %splat3 = insertelement <32 x float> %splat2, float %lane0, i32 3
  %splat4 = insertelement <32 x float> %splat3, float %lane0, i32 4
  %splat5 = insertelement <32 x float> %splat4, float %lane0, i32 5
  %splat6 = insertelement <32 x float> %splat5, float %lane0, i32 6
  %splat7 = insertelement <32 x float> %splat6, float %lane0, i32 7
  %splat8 = insertelement <32 x float> %splat7, float %lane0, i32 8
  %splat9 = insertelement <32 x float> %splat8, float %lane0, i32 9
  %splat10 = insertelement <32 x float> %splat9, float %lane0, i32 10
  %splat11 = insertelement <32 x float> %splat10, float %lane0, i32 11
  %splat12 = insertelement <32 x float> %splat11, float %lane0, i32 12
  %splat13 = insertelement <32 x float> %splat12, float %lane0, i32 13
  %splat14 = insertelement <32 x float> %splat13, float %lane0, i32 14
  %splat15 = insertelement <32 x float> %splat14, float %lane0, i32 15
  %splat16 = insertelement <32 x float> %splat15, float %lane0, i32 16
  %splat17 = insertelement <32 x float> %splat16, float %lane0, i32 17
  %splat18 = insertelement <32 x float> %splat17, float %lane0, i32 18
  %splat19 = insertelement <32 x float> %splat18, float %lane0, i32 19
  %splat20 = insertelement <32 x float> %splat19, float %lane0, i32 20
  %splat21 = insertelement <32 x float> %splat20, float %lane0, i32 21
  %splat22 = insertelement <32 x float> %splat21, float %lane0, i32 22
  %splat23 = insertelement <32 x float> %splat22, float %lane0, i32 23
  %splat24 = insertelement <32 x float> %splat23, float %lane0, i32 24
  %splat25 = insertelement <32 x float> %splat24, float %lane0, i32 25
  %splat26 = insertelement <32 x float> %splat25, float %lane0, i32 26
  %splat27 = insertelement <32 x float> %splat26, float %lane0, i32 27
  %splat28 = insertelement <32 x float> %splat27, float %lane0, i32 28
  %splat29 = insertelement <32 x float> %splat28, float %lane0, i32 29
  %splat30 = insertelement <32 x float> %splat29, float %lane0, i32 30
  %splat31 = insertelement <32 x float> %splat30, float %lane0, i32 31

  br label %loop

loop:
  ; Loop-carried accumulator: the splat on entry, previous MFMA result afterwards.
  %acc = phi <32 x float> [ %splat31, %entry ], [ %acc.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  ; Leave the loop once the incremented counter reaches 16.
  %done = icmp eq i32 %iv.next, 16
  br i1 %done, label %exit, label %loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
576
577; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
578
579; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
580; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
581; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
582
583; Check that we do not copy agprs to vgprs and back in an outer loop.
584
585; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
586; GCN-NOT:  v_accvgpr
587; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
588; GCN-NOT:  v_accvgpr
589; GFX908_A: v_mfma_f32_32x32x1f32
590; GFX940:   v_mfma_f32_32x32x1_2b_f32
591; GCN-NOT:  v_accvgpr
592; GCN:      s_cbranch_scc1 [[INNER_LOOP]]
593; GCN-NOT:  v_accvgpr
594; GCN:      s_cbranch_scc1 [[OUTER_LOOP]]
595
596; Final result should be read only once after the loop.
597
598; GFX908-COUNT-32: v_accvgpr_read_b32
599; GFX90A-NOT:      v_accvgpr_read_b32
600; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
601; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
602
; Two nested counted loops, 16 iterations each. The MFMA sits in the inner
; loop; its <32 x float> accumulator starts at zero and is carried across both
; the inner and the outer back edges. The final value is stored to %arg.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 {
entry:
  br label %outer.loop

outer.loop:
  ; Outer-carried accumulator: zero on entry, last inner-loop result afterwards.
  %acc.outer = phi <32 x float> [ zeroinitializer, %entry ], [ %acc.next, %outer.latch ]
  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
  br label %inner.loop

inner.loop:
  ; Inner-carried accumulator: the outer value on entry, previous MFMA result
  ; afterwards. The inner counter restarts at 0 for every outer iteration.
  %acc = phi <32 x float> [ %acc.outer, %outer.loop ], [ %acc.next, %inner.loop ]
  %iv = phi i32 [ 0, %outer.loop ], [ %iv.next, %inner.loop ]
  %acc.next = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  %iv.next = add nuw nsw i32 %iv, 1
  %inner.done = icmp eq i32 %iv.next, 16
  br i1 %inner.done, label %outer.latch, label %inner.loop

outer.latch:
  %outer.iv.next = add nuw nsw i32 %outer.iv, 1
  %outer.done = icmp eq i32 %outer.iv.next, 16
  br i1 %outer.done, label %exit, label %outer.loop

exit:
  store <32 x float> %acc.next, ptr addrspace(1) %arg
  ret void
}
629
630declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
631declare i32 @llvm.amdgcn.workitem.id.x()
632
633attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
634