xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir (revision a82032918cd445e5750e171f57d4f3d7096c021a)
1# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
2
# A VALU add that reads v0 written by the immediately preceding VALU add.
# Expects s_delay_alu instid0(VALU_DEP_1) between the two.
3---
4name: valu_dep_1
5body: |
6  bb.0:
7    ; CHECK-LABEL: {{^}}valu_dep_1:
8    ; CHECK: %bb.0:
9    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
10    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
11    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
12    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
13    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
14...
15
# The last add reads v0, which was written two VALU instructions earlier (an
# unrelated add to v1 sits in between). Expects VALU_DEP_2.
16---
17name: valu_dep_2
18body: |
19  bb.0:
20    ; CHECK-LABEL: {{^}}valu_dep_2:
21    ; CHECK: %bb.0:
22    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
23    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
24    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
25    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
26    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
27    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
28    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
29...
30
# The last add reads v0, which was written three VALU instructions earlier.
# Expects VALU_DEP_3.
31---
32name: valu_dep_3
33body: |
34  bb.0:
35    ; CHECK-LABEL: {{^}}valu_dep_3:
36    ; CHECK: %bb.0:
37    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
38    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
39    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
40    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
41    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
42    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
43    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
44    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
45    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
46...
47
# The last add reads v0, which was written four VALU instructions earlier.
# Expects VALU_DEP_4.
48---
49name: valu_dep_4
50body: |
51  bb.0:
52    ; CHECK-LABEL: {{^}}valu_dep_4:
53    ; CHECK: %bb.0:
54    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
55    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
56    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
57    ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
58    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
59    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
60    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
61    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
62    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
63    $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
64    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
65...
66
67# There's no encoding for VALU_DEP_5. A normal VALU instruction will have
68# completed already.
# The last add reads v0, which was written five VALU instructions earlier.
# Expects no s_delay_alu anywhere in the block.
69---
70name: valu_dep_5
71body: |
72  bb.0:
73    ; CHECK-LABEL: {{^}}valu_dep_5:
74    ; CHECK: %bb.0:
75    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
76    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
77    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
78    ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
79    ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
80    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
81    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
82    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
83    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
84    $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
85    $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
86    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
87...
88
# A VALU add that reads v0 written by the immediately preceding v_exp_f32.
# Expects s_delay_alu instid0(TRANS32_DEP_1).
89---
90name: trans32_dep_1
91body: |
92  bb.0:
93    ; CHECK-LABEL: {{^}}trans32_dep_1:
94    ; CHECK: %bb.0:
95    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
96    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
97    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
98    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
99    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
100...
101
# The add reads v0, which was written by the v_exp_f32 two instructions
# earlier (a second v_exp_f32 to v1 sits in between). Expects TRANS32_DEP_2.
102---
103name: trans32_dep_2
104body: |
105  bb.0:
106    ; CHECK-LABEL: {{^}}trans32_dep_2:
107    ; CHECK: %bb.0:
108    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
109    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
110    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
111    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
112    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
113    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
114    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
115...
116
# The add reads v0, which was written by the v_exp_f32 three instructions
# earlier. Expects TRANS32_DEP_3.
117---
118name: trans32_dep_3
119body: |
120  bb.0:
121    ; CHECK-LABEL: {{^}}trans32_dep_3:
122    ; CHECK: %bb.0:
123    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
124    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
125    ; CHECK-NEXT: v_exp_f32_e32 v2, v2
126    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
127    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
128    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
129    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
130    $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
131    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
132...
133
134# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
135# completed already.
# The add reads v0, which was written by the v_exp_f32 four instructions
# earlier. Expects no s_delay_alu anywhere in the block.
136---
137name: trans32_dep_4
138body: |
139  bb.0:
140    ; CHECK-LABEL: {{^}}trans32_dep_4:
141    ; CHECK: %bb.0:
142    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
143    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
144    ; CHECK-NEXT: v_exp_f32_e32 v2, v2
145    ; CHECK-NEXT: v_exp_f32_e32 v3, v3
146    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
147    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
148    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
149    $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
150    $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
151    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
152...
153
# A VALU add that reads s0 written by the immediately preceding s_mov_b32.
# Expects s_delay_alu instid0(SALU_CYCLE_1).
154---
155name: salu_cycle_1
156body: |
157  bb.0:
158    ; CHECK-LABEL: {{^}}salu_cycle_1:
159    ; CHECK: %bb.0:
160    ; CHECK-NEXT: s_mov_b32 s0, 0
161    ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
162    ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
163    $sgpr0 = S_MOV_B32 0
164    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
165...
166
167# There's no need for SALU_CYCLE_2 here because the s_mov will have completed
168# already.
# An unrelated VALU add sits between the s_mov_b32 that writes s0 and the add
# that reads it. Expects no s_delay_alu anywhere in the block.
169---
170name: salu_cycle_2
171body: |
172  bb.0:
173    ; CHECK-LABEL: {{^}}salu_cycle_2:
174    ; CHECK: %bb.0:
175    ; CHECK-NEXT: s_mov_b32 s0, 0
176    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
177    ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
178    $sgpr0 = S_MOV_B32 0
179    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
180    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
181...
182
# The last add reads both v0 (written by the v_exp_f32) and v1 (written by the
# preceding add). Expects both dependencies in one s_delay_alu:
# instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1).
183---
184name: valu_dep_1_same_trans32_dep_1
185body: |
186  bb.0:
187    ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
188    ; CHECK: %bb.0:
189    ; CHECK-NEXT: v_exp_f32_e32 v0, v0
190    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
191    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
192    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
193    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
194    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
195    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
196...
197
198# There's no need to encode the VALU dependency because it will complete before
199# the TRANS.
# The last add reads v0 (written by the first add) and v1 (written by the
# v_exp_f32 right before it). Expects only instid0(TRANS32_DEP_1), with no
# VALU_DEP term.
200---
201name: trans32_dep_1_only
202body: |
203  bb.0:
204    ; CHECK-LABEL: {{^}}trans32_dep_1_only:
205    ; CHECK: %bb.0:
206    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
207    ; CHECK-NEXT: v_exp_f32_e32 v1, v1
208    ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
209    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
210    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
211    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
212    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
213...
214
# The last add reads both v0 (written by the first add) and s0 (written by the
# s_mov_b32 right before it). Expects both dependencies in one s_delay_alu:
# instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1).
215---
216name: valu_dep_1_same_salu_cycle_1
217body: |
218  bb.0:
219    ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
220    ; CHECK: %bb.0:
221    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
222    ; CHECK-NEXT: s_mov_b32 s0, 0
223    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
224    ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
225    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
226    $sgpr0 = S_MOV_B32 0
227    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
228...
229
# Three chained adds on v0, each reading the previous result. Expects a single
# s_delay_alu before the second add that carries both delays, joined with
# instskip(NEXT), and no separate s_delay_alu before the third add.
230---
231name: valu_dep_1_next_valu_dep_1
232body: |
233  bb.0:
234    ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
235    ; CHECK: %bb.0:
236    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
237    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
238    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
239    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
240    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
241    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
242    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
243...
244
# Adds alternate between v0 and v1, so the third and fourth adds each read a
# value written two VALU instructions earlier. Expects a single s_delay_alu
# before the third add: VALU_DEP_2 | instskip(NEXT) | VALU_DEP_2.
245---
246name: valu_dep_2_next_valu_dep_2
247body: |
248  bb.0:
249    ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
250    ; CHECK: %bb.0:
251    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
252    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
253    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
254    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
255    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
256    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
257    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
258    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
259    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
260...
261
262# There's no need to encode a dependency for the second mul, because the
263# dependency for the first mul has already guaranteed that the add has
264# completed.
# Both muls read v0 written by the add. Expects s_delay_alu
# instid0(VALU_DEP_1) before the first mul only.
265---
266name: valu_dep_1_no_next_1
267body: |
268  bb.0:
269    ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
270    ; CHECK: %bb.0:
271    ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
272    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
273    ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
274    ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
275    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
276    $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
277    $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
278...
279
280# There's no need to encode a dependency for the second add, because the
281# dependency for the second mul has already guaranteed that a later VALU has
282# completed.
# Sequence is add v0, mul v1, mul v1, add v0. Expects s_delay_alu
# instid0(VALU_DEP_1) before the second mul and nothing before the final add.
283---
284name: valu_dep_1_no_next_2
285body: |
286  bb.0:
287    ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
288    ; CHECK: %bb.0:
289    ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
290    ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
291    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
292    ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
293    ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
294    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
295    $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
296    $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
297    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
298...
299
300# There are no wait states between an add/sub/cmp generating carry and an
301# add/sub/cndmask that consumes it, so no need to encode a dependency.
302
# The e32 compare defines vcc implicitly and the cndmask reads vcc.
# Expects no s_delay_alu between them.
303---
304name: implicit_cmp_cndmask
305body: |
306  bb.0:
307    ; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
308    ; CHECK: %bb.0:
309    ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
310    ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
311    implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
312    $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
313...
314
315# TODO: There should be no s_delay_alu here.
# The e64 compare writes s[0:1] as an explicit operand and the cndmask reads
# it. The CHECK lines pin the current output, which does contain an
# s_delay_alu instid0(VALU_DEP_1) between the two.
316---
317name: explicit_cmp_cndmask
318body: |
319  bb.0:
320    ; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
321    ; CHECK: %bb.0:
322    ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
323    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
324    ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
325    $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
326    $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
327...
328
# Two add-with-carry instructions, each reading and writing vcc implicitly.
# Expects no s_delay_alu between them.
329---
330name: implicit_addc_addc
331body: |
332  bb.0:
333    ; CHECK-LABEL: {{^}}implicit_addc_addc:
334    ; CHECK: %bb.0:
335    ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
336    ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
337    $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
338    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
339...
340
# The e64 add writes vcc as an explicit carry-out operand and the following
# add-with-carry reads vcc implicitly. Expects no s_delay_alu between them.
341---
342name: explicit_addc_addc
343body: |
344  bb.0:
345    ; CHECK-LABEL: {{^}}explicit_addc_addc:
346    ; CHECK: %bb.0:
347    ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
348    ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
349    $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
350    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
351...
352
# Same instruction sequence as valu_dep_3, but the two middle adds are wrapped
# in a BUNDLE. Expects VALU_DEP_3, i.e. the bundled instructions are counted
# individually.
353---
354name: valu_dep_3_bundle
355body: |
356  bb.0:
357    ; CHECK-LABEL: {{^}}valu_dep_3_bundle:
358    ; CHECK: %bb.0:
359    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
360    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
361    ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
362    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
363    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
364    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
365    BUNDLE {
366      $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
367      $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
368    }
369    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
370...
371
# bb.2 is entered either by the branch from bb.0 (no VALU executed) or by
# falling through from bb.1, which writes v0. Expects s_delay_alu
# instid0(VALU_DEP_1) at the start of bb.2 before the add that reads v0.
372---
373name: if
374body: |
375  bb.0:
376    ; CHECK-LABEL: {{^}}if:
377    ; CHECK: %bb.0:
378    ; CHECK-NEXT: s_cbranch_vccz .LBB23_2
379    ; CHECK-NEXT: %bb.1:
380    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
381    ; CHECK-NEXT: .LBB23_2:
382    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
383    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
384    S_CBRANCH_VCCZ %bb.2, implicit $vcc
385  bb.1:
386    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
387  bb.2:
388    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
389...
390
# bb.3 is entered either by the s_branch from bb.1 (no VALU executed) or by
# falling through from bb.2, which writes v0. Expects s_delay_alu
# instid0(VALU_DEP_1) at the start of bb.3 before the add that reads v0.
391---
392name: else
393body: |
394  bb.0:
395    ; CHECK-LABEL: {{^}}else:
396    ; CHECK: %bb.0:
397    ; CHECK-NEXT: s_cbranch_vccz .LBB24_2
398    ; CHECK-NEXT: %bb.1:
399    ; CHECK-NEXT: s_branch .LBB24_3
400    ; CHECK-NEXT: .LBB24_2:
401    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
402    ; CHECK-NEXT: .LBB24_3:
403    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
404    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
405    S_CBRANCH_VCCZ %bb.2, implicit $vcc
406  bb.1:
407    S_BRANCH %bb.3
408  bb.2:
409    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
410  bb.3:
411    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
412...
413
# Both arms write v0 as their last VALU instruction before reaching bb.3
# (bb.1 via s_branch, bb.2 by fall-through). Expects s_delay_alu
# instid0(VALU_DEP_1) at the start of bb.3, and no s_delay_alu inside bb.2,
# whose second add does not read the first add's result.
414---
415name: if_else
416body: |
417  bb.0:
418    ; CHECK-LABEL: {{^}}if_else:
419    ; CHECK: %bb.0:
420    ; CHECK-NEXT: s_cbranch_vccz .LBB25_2
421    ; CHECK-NEXT: %bb.1:
422    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
423    ; CHECK-NEXT: s_branch .LBB25_3
424    ; CHECK-NEXT: .LBB25_2:
425    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
426    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
427    ; CHECK-NEXT: .LBB25_3:
428    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
429    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
430    S_CBRANCH_VCCZ %bb.2, implicit $vcc
431  bb.1:
432    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
433    S_BRANCH %bb.3
434  bb.2:
435    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
436    $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
437  bb.3:
438    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
439...
440
441# Dependency from outside the loop.
# bb.0 writes v0; the self-looping bb.1 reads v0 and writes v1. Expects
# s_delay_alu instid0(VALU_DEP_1) at the top of the loop body.
442---
443name: loop_1
444body: |
445  bb.0:
446    ; CHECK-LABEL: {{^}}loop_1:
447    ; CHECK: %bb.0:
448    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
449    ; CHECK-NEXT: .LBB26_1:
450    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
451    ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
452    ; CHECK-NEXT: s_cbranch_vccz .LBB26_1
453    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
454  bb.1:
455    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
456    S_CBRANCH_VCCZ %bb.1, implicit $vcc
457  bb.2:
458...
459
460# Dependency from inside the loop.
# bb.0 is empty; the self-looping bb.1 both reads and writes v0, so the only
# producer of v0 is the same add on the previous iteration. Expects
# s_delay_alu instid0(VALU_DEP_1) at the top of the loop body.
461---
462name: loop_2
463body: |
464  bb.0:
465    ; CHECK-LABEL: {{^}}loop_2:
466    ; CHECK: %bb.0:
467    ; CHECK-NEXT: .LBB27_1:
468    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
469    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
470    ; CHECK-NEXT: s_cbranch_vccz .LBB27_1
471  bb.1:
472    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
473    S_CBRANCH_VCCZ %bb.1, implicit $vcc
474  bb.2:
475...
476
477# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
478# to complete.
# Expects SALU_CYCLE_1 between s_sendmsg_rtn_b32 and the s_add_u32 that reads
# its s0 result, and no s_delay_alu before the final add even though it reads
# v0 written by the v_mov_b32 at the top of the block.
479---
480name: sendmsg_rtn
481body: |
482  bb.0:
483    ; CHECK-LABEL: {{^}}sendmsg_rtn:
484    ; CHECK: %bb.0:
485    ; CHECK-NEXT: v_mov_b32_e32 v0, 0
486    ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
487    ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
488    ; CHECK-NEXT: s_add_u32 s0, s0, s0
489    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
490    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
491    $sgpr0 = S_SENDMSG_RTN_B32 128
492    $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
493    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
494...
495
496# No VALU delay before or across FLAT because it waits for all outstanding VALU
497# to complete.
# The flat load reads v[0:1] written by the preceding v_mov_b32s, and the
# final add reads v2 written before the load. Expects no s_delay_alu anywhere
# in the block.
498---
499name: flat_load
500body: |
501  bb.0:
502    ; CHECK-LABEL: {{^}}flat_load:
503    ; CHECK: %bb.0:
504    ; CHECK-NEXT: v_mov_b32_e32 v0, 0
505    ; CHECK-NEXT: v_mov_b32_e32 v1, 0
506    ; CHECK-NEXT: v_mov_b32_e32 v2, 0
507    ; CHECK-NEXT: flat_load_b32 v0, v[0:1]
508    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
509    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
510    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
511    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
512    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
513    $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
514...
515
516# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
517# to complete.
# S_WAITCNT_DEPCTR 4095 (printed as 0xfff) sits between the v_mov_b32 that
# writes v0 and the add that reads it. Expects no s_delay_alu.
518---
519name: waitcnt_depctr
520body: |
521  bb.0:
522    ; CHECK-LABEL: {{^}}waitcnt_depctr:
523    ; CHECK: %bb.0:
524    ; CHECK-NEXT: v_mov_b32_e32 v0, 0
525    ; CHECK-NEXT: s_waitcnt_depctr 0xfff
526    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
527    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
528    S_WAITCNT_DEPCTR 4095
529    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
530...
531
532# Check that no delays are emitted for writelane instructions.
# Four back-to-back v_writelane_b32 instructions, each reading and writing v0.
# Expects no s_delay_alu between any of them.
533---
534name: writelane1
535body: |
536  bb.0:
537    ; CHECK-LABEL: {{^}}writelane1:
538    ; CHECK: %bb.0:
539    ; CHECK-NEXT: v_writelane_b32 v0, s0, 0
540    ; CHECK-NEXT: v_writelane_b32 v0, s0, 1
541    ; CHECK-NEXT: v_writelane_b32 v0, s0, 2
542    ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
543    $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
544    $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
545    $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
546    $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
547...
548
549# Check that a VALU delay is added after writelane.
# A VALU add that reads v0 written by the immediately preceding
# v_writelane_b32. Expects s_delay_alu instid0(VALU_DEP_1).
550---
551name: writelane2
552body: |
553  bb.0:
554    ; CHECK-LABEL: {{^}}writelane2:
555    ; CHECK: %bb.0:
556    ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
557    ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
558    ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
559    $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
560    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
561...
562