xref: /llvm-project/llvm/test/CodeGen/AMDGPU/waitcnt.mir (revision 4f90e75bdc156d2630da525eb74d00611753c706)
1# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts  %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
2# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts  %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
3
4--- |
  ; Stub IR definitions for the MIR functions in this file; each body is just
  ; `ret void`. The pointer arguments of @flat_zero_waitcnt are referenced by
  ; name (%ir.global4, %ir.global16, %ir.flat4, %ir.flat16) from the memory
  ; operands in that function's MIR body.
5  define amdgpu_kernel void @flat_zero_waitcnt(ptr addrspace(1) %global4,
6                                 ptr addrspace(1) %global16,
7                                 ptr %flat4,
8                                 ptr %flat16) {
9    ret void
10  }
11
12  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
13    ret void
14  }
15
16  define amdgpu_kernel void @single_branch_successor_not_next_block() {
17    ret void
18  }
19
20  define amdgpu_kernel void @preexisting_waitcnt() {
21    ret void
22  }
23
24  define amdgpu_kernel void @bundle_no_waitcnt() {
25    ret void
26  }
27
28  define amdgpu_kernel void @preexisting_waitcnt_in_bundle() {
29    ret void
30  }
31
32  define amdgpu_kernel void @insert_in_bundle() {
33    ret void
34  }
35
36  define amdgpu_kernel void @exit_bundle() {
37    ret void
38  }
39
40  define amdgpu_kernel void @cross_bundle() {
41    ret void
42  }
43
44  define amdgpu_kernel void @subregs16bit() {
45    ret void
46  }
47
48  define amdgpu_kernel void @waitcnt_backedge() {
49    ret void
50  }
51...
52---
53
54# CHECK-LABEL: name: flat_zero_waitcnt
55
56# CHECK-LABEL: bb.0:
57# CHECK: FLAT_LOAD_DWORD
58# CHECK: FLAT_LOAD_DWORDX4
59# Global loads will return in order, so we only need to wait for the first load:
60# s_waitcnt vmcnt(1)
61# CHECK-NEXT: S_WAITCNT 3953
62
63# CHECK-LABEL: bb.1:
64# CHECK: FLAT_LOAD_DWORD
65# s_waitcnt vmcnt(0)
66# GFX89: S_WAITCNT 3952
67# CHECK: FLAT_LOAD_DWORDX4
68
69# CHECK-LABEL: bb.2:
70# CHECK: FLAT_LOAD_DWORD
71# s_waitcnt vmcnt(0)
72# GFX89: S_WAITCNT 3952
73# CHECK: FLAT_LOAD_DWORDX4
74
75# CHECK-LABEL: bb.3:
76# s_waitcnt vmcnt(0)
77# GFX89: S_WAITCNT 3952
78# CHECK: FLAT_LOAD_DWORD
79# CHECK: FLAT_LOAD_DWORD
80# s_waitcnt vmcnt(0) lgkmcnt(0)
81# GFX89: S_WAITCNT 112
82
83# CHECK-LABEL: bb.4:
84# GFX89-NOT: S_WAITCNT
85# CHECK: FLAT_LOAD_DWORD
86# s_waitcnt vmcnt(0) lgkmcnt(0)
87# GFX89: S_WAITCNT 112
88
# Memory operands per block, as written in the body below:
#   bb.0 - both loads carry global operands (%ir.global4, %ir.global16).
#   bb.1 - the first load has no memory operand; the second uses %ir.global16.
#   bb.2 - both loads carry flat operands (%ir.flat4, %ir.flat16).
#   bb.3 - the first load uses %ir.flat4, the second %ir.global4.
#   bb.4 - a single load using %ir.flat4.
# In bb.0 through bb.2 the trailing V_MOV_B32 overwrites $vgpr0, the
# destination of the block's first load; in bb.3 and bb.4 it reads the register
# written by the %ir.flat4 load.
89name: flat_zero_waitcnt
90
91body: |
92  bb.0:
93    successors: %bb.1
94    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.global4)
95    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.global16)
96    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
97    S_BRANCH %bb.1
98
99  bb.1:
100    successors: %bb.2
101    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
102    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.global16)
103    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
104    S_BRANCH %bb.2
105
106  bb.2:
107    successors: %bb.3
108    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.flat4)
109    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.flat16)
110    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
111    S_BRANCH %bb.3
112
113  bb.3:
114    successors: %bb.4
115    $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.flat4)
116    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.global4)
117    $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec
118    S_BRANCH %bb.4
119
120  bb.4:
121    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.flat4)
122    $vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec
123    S_ENDPGM 0
124...
125---
126# There is only a single fallthrough successor block, so there's no
127# need to wait immediately.
128
129# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
130# CHECK:   $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2
131# CHECK-NOT: S_WAITCNT
132
133# CHECK: bb.1:
134# CHECK-NEXT: V_LSHLREV_B64_e64
135# CHECK-NEXT: S_WAITCNT 112
136# CHECK-NEXT: FLAT_STORE_DWORD
# bb.0 contains only the load of $vgpr0 and has no terminator, so it falls
# through into bb.1. The first use of $vgpr0 is the FLAT_STORE_DWORD in bb.1,
# after an unrelated V_LSHLREV_B64 that computes the store address.
137name: single_fallthrough_successor_no_end_block_wait
138
139body: |
140  bb.0:
141    successors: %bb.1
142    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
143
144  bb.1:
145    $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
146    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
147    S_ENDPGM 0
148...
149---
150# The block has a single predecessor with a single successor, but it
151# is not the next block so it's non-obvious that the wait is not needed.
152
153
154# CHECK-LABEL: name: single_branch_successor_not_next_block
155
156# CHECK: bb.1
157# CHECK-NEXT: FLAT_STORE_DWORD
158# CHECK-NEXT: S_ENDPGM 0
159
160# CHECK: bb.2:
161# CHECK-NEXT: V_LSHLREV_B64_e64
162# CHECK-NEXT: S_WAITCNT 112
163# CHECK-NEXT: FLAT_STORE_DWORD
# bb.0 loads $vgpr0 and branches over bb.1 straight to bb.2, where $vgpr0 is
# first used by the store. bb.1 is not listed as a successor of any block in
# this body, and its store uses registers unrelated to the load
# ($vgpr8_vgpr9, $vgpr10).
164name: single_branch_successor_not_next_block
165
166body: |
167  bb.0:
168    successors: %bb.2
169    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
170   S_BRANCH %bb.2
171
172  bb.1:
173    FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
174    S_ENDPGM 0
175
176  bb.2:
177     $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
178    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
179    S_ENDPGM 0
180...
181
182# CHECK-LABEL: name: preexisting_waitcnt{{$}}
183# CHECK: FLAT_LOAD_DWORD
184# CHECK-NEXT: S_WAITCNT 0
185# CHECK-NOT: S_WAITCNT
# The input already has `S_WAITCNT 0` between the load of $vgpr0 and the store
# that reads it. The checks above expect that wait to remain directly after
# the load and no further S_WAITCNT to follow it.
186name: preexisting_waitcnt
187tracksRegLiveness: true
188machineFunctionInfo:
189  isEntryFunction: true
190body: |
191  bb.0:
192    liveins: $vgpr1_vgpr2
193    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
194    S_WAITCNT 0
195    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
196
197...
198
199---
200
201# CHECK-LABEL: name: bundle_no_waitcnt{{$}}
202# CHECK: FLAT_LOAD_DWORD
203# CHECK-NEXT: BUNDLE
204# CHECK-NEXT: S_NOP
205# CHECK-NEXT: S_NOP
206# CHECK-NEXT: }
207# CHECK-NEXT: S_WAITCNT 112
# The bundle between the load and the store holds only two S_NOPs, so nothing
# inside it reads $vgpr0. The checks above expect the bundle to be left intact
# and the wait to appear right after its closing brace, i.e. before the store
# that reads $vgpr0.
208name: bundle_no_waitcnt
209tracksRegLiveness: true
210machineFunctionInfo:
211  isEntryFunction: true
212body: |
213  bb.0:
214    liveins: $vgpr1_vgpr2
215    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
216    BUNDLE {
217      S_NOP 0
218      S_NOP 0
219    }
220    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
221
222...
223
224---
225
226# See the waitcnt inside the bundle and don't insert an extra one.
227# CHECK-LABEL: name: preexisting_waitcnt_in_bundle{{$}}
228# CHECK: FLAT_LOAD_DWORD
229# CHECK: S_WAITCNT 0
230# CHECK-NOT: S_WAITCNT
# The input bundle between the load and the store already contains
# `S_WAITCNT 0` (after an S_NOP). The store that reads $vgpr0 comes after the
# bundle.
231name: preexisting_waitcnt_in_bundle
232tracksRegLiveness: true
233machineFunctionInfo:
234  isEntryFunction: true
235body: |
236  bb.0:
237    liveins: $vgpr1_vgpr2
238    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
239    BUNDLE {
240      S_NOP 0
241      S_WAITCNT 0
242    }
243    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
244
245...
246
247---
248
249# Def and use inside bundle
250# CHECK-LABEL: name: insert_in_bundle{{$}}
251# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
252# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
253# CHECK-NEXT: S_WAITCNT 112
254# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
255# CHECK-NEXT: }
256
# The load that defines $vgpr0 and the store that reads it (operand marked
# `internal`) sit in the same bundle. The checks above expect the wait to be
# placed between the two bundled instructions.
257name: insert_in_bundle
258tracksRegLiveness: true
259machineFunctionInfo:
260  isEntryFunction: true
261body: |
262  bb.0:
263    liveins: $vgpr1_vgpr2
264    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
265    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
266    FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
267    }
268...
269
270---
271
272# Def is last instruction in bundle, use is outside bundle
273
274# CHECK-LABEL: name: exit_bundle{{$}}
275# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
276# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
277# CHECK-NEXT: }
278# CHECK-NEXT: S_WAITCNT 112
279# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
280
# The bundle contains only the load that defines $vgpr0; the store that reads
# $vgpr0 is an ordinary instruction after the bundle. The checks above expect
# the wait after the bundle's closing brace, directly before the store.
281name: exit_bundle
282tracksRegLiveness: true
283machineFunctionInfo:
284  isEntryFunction: true
285body: |
286  bb.0:
287    liveins: $vgpr1_vgpr2
288    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
289    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
290    }
291
292    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
293
294...
295
296---
297
298# Def is in bundle, use is in another bundle
299
300# CHECK-LABEL: name: cross_bundle{{$}}
301# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
302# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
303# CHECK-NEXT: }
304# CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
305# CHECK-NEXT: S_WAITCNT 112
306# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
307# CHECK-NEXT: }
308
# Two back-to-back bundles: the first holds the load that defines $vgpr0, the
# second holds the store that reads it. The checks above expect the wait to be
# inserted inside the second bundle, ahead of the store.
309name: cross_bundle
310tracksRegLiveness: true
311machineFunctionInfo:
312  isEntryFunction: true
313body: |
314  bb.0:
315    liveins: $vgpr1_vgpr2
316    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
317    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
318    }
319    BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
320      FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
321    }
322...
323
324---
325# CHECK-LABEL: name: subregs16bit
326# CHECK: S_WAITCNT 112
327# CHECK-NEXT: V_NOP_e32
328
# Two FLAT_LOAD_USHORT instructions write $vgpr0 and $vgpr1; the V_NOP_e32
# then reads only their 16-bit low halves ($vgpr0_lo16, $vgpr1_lo16) through
# implicit operands. The checks above expect a wait immediately before that
# V_NOP_e32, i.e. the sub-register reads must be matched to the pending loads.
329name: subregs16bit
330machineFunctionInfo:
331  isEntryFunction: true
332body: |
333  bb.0:
334    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4
335      $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
336      $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr
337      V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16
338...
339
340---
341# Waitcnt required before the use of $sgpr10_sgpr11, as the S_LOAD also writes
342# to $sgpr10_sgpr11, and can occur first in the program running order.
343
344# CHECK-LABEL: name: waitcnt_backedge
345# CHECK: S_WAITCNT
346# CHECK: $sgpr10_sgpr11 = S_CSELECT_B64
347# CHECK: $sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM
348
349
# Control flow as written below: bb.0 has no terminator and falls through into
# bb.4; bb.10 ends in `S_BRANCH %bb.4`, which is the only branch back to bb.4
# in this body (presumably the backedge the test is named for). Within bb.4,
# S_CSELECT_B64 writes $sgpr10_sgpr11 first and S_LOAD_DWORDX2_IMM writes the
# same register pair later in the block.
350name: waitcnt_backedge
351body: |
352  bb.0:
353    renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr2_sgpr3, 32, 0 :: (load (s128) from `ptr addrspace(4) undef`, addrspace 4)
354
355  bb.4:
356    renamable $sgpr10_sgpr11 = S_CSELECT_B64 -1, 0, implicit killed $scc
357    renamable $vgpr1 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr5, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
358    renamable $sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s64) from `ptr addrspace(4) undef`, align 4, addrspace 4)
359    S_CBRANCH_SCC0 %bb.9, implicit killed $scc
360
361  bb.9:
362    renamable $vgpr1 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr14_sgpr15, implicit $exec
363    S_CBRANCH_SCC0 %bb.14, implicit killed $scc
364
365  bb.10:
366    S_BRANCH %bb.4
367
368  bb.14:
369    S_ENDPGM 0
370...
371