xref: /llvm-project/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
3
4; The call argument is produced by a load; the waitcnt for that load should be skipped before the call.
; Kernel: performs a volatile i32 load through the addrspace(3) pointer %ptr
; and passes the loaded value to @func. The second, unnamed i32 parameter is
; not referenced by the body.
; Expected output: the only s_waitcnt is the lgkmcnt(0) that precedes the
; v_mov_b32 reading s6 (written by the s_load_dword); no s_waitcnt appears
; between ds_read_b32 and s_swappc_b64.
5define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
6; GCN-LABEL: call_memory_arg_load:
7; GCN:       ; %bb.0:
8; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
9; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
10; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
11; GCN-NEXT:    s_add_u32 s0, s0, s11
12; GCN-NEXT:    s_addc_u32 s1, s1, 0
13; GCN-NEXT:    s_waitcnt lgkmcnt(0)
14; GCN-NEXT:    v_mov_b32_e32 v0, s6
15; GCN-NEXT:    ds_read_b32 v0, v0
16; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
17; GCN-NEXT:    s_mov_b32 s32, 0
18; GCN-NEXT:    s_getpc_b64 s[8:9]
19; GCN-NEXT:    s_add_u32 s8, s8, func@rel32@lo+4
20; GCN-NEXT:    s_addc_u32 s9, s9, func@rel32@hi+12
21; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
22; GCN-NEXT:    s_endpgm
  ; The loaded value is the only argument of the call that follows.
23  %vgpr = load volatile i32, ptr addrspace(3) %ptr
24  call void @func(i32 %vgpr)
25  ret void
26}
27
28; Outstanding memory operation with no register dependence on the call; no waitcnt is expected before the call.
; Kernel: stores i32 0 through the addrspace(1) pointer %ptr and then calls
; @func with the constant 0, so the call argument does not depend on the
; store. The second, unnamed i32 parameter is not referenced by the body.
; Expected output: s_waitcnt lgkmcnt(0) precedes global_store_dword, which
; uses s[6:7] written by the s_load_dwordx2; no s_waitcnt appears between the
; store and s_swappc_b64.
29define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
30; GCN-LABEL: call_memory_no_dep:
31; GCN:       ; %bb.0:
32; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
33; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
34; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
35; GCN-NEXT:    s_add_u32 s0, s0, s11
36; GCN-NEXT:    v_mov_b32_e32 v0, 0
37; GCN-NEXT:    s_addc_u32 s1, s1, 0
38; GCN-NEXT:    s_waitcnt lgkmcnt(0)
39; GCN-NEXT:    global_store_dword v0, v0, s[6:7]
40; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
41; GCN-NEXT:    v_mov_b32_e32 v0, 0
42; GCN-NEXT:    s_mov_b32 s32, 0
43; GCN-NEXT:    s_getpc_b64 s[8:9]
44; GCN-NEXT:    s_add_u32 s8, s8, func@rel32@lo+4
45; GCN-NEXT:    s_addc_u32 s9, s9, func@rel32@hi+12
46; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
47; GCN-NEXT:    s_endpgm
  ; Store first, then call: the store is still ahead of the call in program order.
48  store i32 0, ptr addrspace(1) %ptr
49  call void @func(i32 0)
50  ret void
51}
52
53; Should not insert a waitcnt between the call and the memory access that follows it.
; Kernel: calls @func with the constant 0 and afterwards stores i32 0 through
; the addrspace(1) pointer %ptr. The second, unnamed i32 parameter is not
; referenced by the body.
; Expected output: no s_waitcnt anywhere in the function. s[34:35] (written
; by the s_load_dwordx2) and v40 (set to 0) are both prepared before
; s_swappc_b64, and the global_store_dword that uses them directly follows
; the call.
54define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
55; GCN-LABEL: call_no_wait_after_call:
56; GCN:       ; %bb.0:
57; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
58; GCN-NEXT:    s_load_dwordx2 s[34:35], s[6:7], 0x0
59; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
60; GCN-NEXT:    s_add_u32 s0, s0, s11
61; GCN-NEXT:    s_addc_u32 s1, s1, 0
62; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
63; GCN-NEXT:    v_mov_b32_e32 v0, 0
64; GCN-NEXT:    s_mov_b32 s32, 0
65; GCN-NEXT:    s_getpc_b64 s[8:9]
66; GCN-NEXT:    s_add_u32 s8, s8, func@rel32@lo+4
67; GCN-NEXT:    s_addc_u32 s9, s9, func@rel32@hi+12
68; GCN-NEXT:    v_mov_b32_e32 v40, 0
69; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
70; GCN-NEXT:    global_store_dword v40, v40, s[34:35]
71; GCN-NEXT:    s_endpgm
  ; Call first, then store: the memory access comes after the call returns.
72  call void @func(i32 0)
73  store i32 0, ptr addrspace(1) %ptr
74  ret void
75}
76
; Kernel: calls @func.return with the constant 0 and stores the returned i32
; through the addrspace(1) pointer %ptr. The second, unnamed i32 parameter is
; not referenced by the body.
; Expected output: no s_waitcnt anywhere in the function. The
; global_store_dword directly follows s_swappc_b64 and uses v0 together with
; v40 and s[34:35], both of which are set up before the call.
; NOTE(review): v0 presumably carries the return value of @func.return --
; confirm against the calling convention.
77define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
78; GCN-LABEL: call_no_wait_after_call_return_val:
79; GCN:       ; %bb.0:
80; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
81; GCN-NEXT:    s_load_dwordx2 s[34:35], s[6:7], 0x0
82; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
83; GCN-NEXT:    s_add_u32 s0, s0, s11
84; GCN-NEXT:    s_addc_u32 s1, s1, 0
85; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
86; GCN-NEXT:    v_mov_b32_e32 v0, 0
87; GCN-NEXT:    s_mov_b32 s32, 0
88; GCN-NEXT:    s_getpc_b64 s[8:9]
89; GCN-NEXT:    s_add_u32 s8, s8, func.return@rel32@lo+4
90; GCN-NEXT:    s_addc_u32 s9, s9, func.return@rel32@hi+12
91; GCN-NEXT:    v_mov_b32_e32 v40, 0
92; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
93; GCN-NEXT:    global_store_dword v40, v0, s[34:35]
94; GCN-NEXT:    s_endpgm
  ; The value returned by the call is the data stored to %ptr.
95  %rv = call i32 @func.return(i32 0)
96  store i32 %rv, ptr addrspace(1) %ptr
97  ret void
98}
99
100; Need to wait for the load of the callee address before the call can be issued.
; Kernel: calls @got.func with the constant 0. Neither %ptr nor the unnamed
; i32 parameter is referenced by the body.
; Expected output: the address is formed with s_getpc_b64 plus the
; got.func@gotpcrel32 offsets, the callee address is loaded from it by
; s_load_dwordx2 into s[8:9], and s_waitcnt lgkmcnt(0) sits directly before
; the s_swappc_b64 that branches through s[8:9].
101define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
102; GCN-LABEL: call_got_load:
103; GCN:       ; %bb.0:
104; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
105; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
106; GCN-NEXT:    s_add_u32 s0, s0, s11
107; GCN-NEXT:    s_addc_u32 s1, s1, 0
108; GCN-NEXT:    s_getpc_b64 s[6:7]
109; GCN-NEXT:    s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
110; GCN-NEXT:    s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
111; GCN-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
112; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
113; GCN-NEXT:    v_mov_b32_e32 v0, 0
114; GCN-NEXT:    s_mov_b32 s32, 0
115; GCN-NEXT:    s_waitcnt lgkmcnt(0)
116; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
117; GCN-NEXT:    s_endpgm
  ; The callee is @got.func, which is declared without the hidden marker.
118  call void @got.func(i32 0)
119  ret void
120}
121
122; Need to wait for the load of the callee address before the tail call can be issued.
; Non-kernel function (no amdgpu_kernel marker): tail-calls @got.func with the
; constant 0. Neither %ptr nor the unnamed i32 parameter is referenced by the
; body.
; Expected output: the function starts with
; s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0), loads the callee address into
; s[4:5] with s_load_dwordx2 from a gotpcrel32-based address, and has
; s_waitcnt lgkmcnt(0) directly before the s_setpc_b64 through s[4:5].
123define void @tailcall_got_load(ptr addrspace(1) %ptr, i32) #0 {
124; GCN-LABEL: tailcall_got_load:
125; GCN:       ; %bb.0:
126; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127; GCN-NEXT:    s_getpc_b64 s[4:5]
128; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
129; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
130; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
131; GCN-NEXT:    v_mov_b32_e32 v0, 0
132; GCN-NEXT:    s_waitcnt lgkmcnt(0)
133; GCN-NEXT:    s_setpc_b64 s[4:5]
  ; Tail call: the expected output ends in s_setpc_b64 rather than s_swappc_b64.
134  tail call void @got.func(i32 0)
135  ret void
136}
137
138; No need to wait for the argument load before the tail call.
; Non-kernel function (no amdgpu_kernel marker): performs a volatile i32 load
; through the addrspace(3) pointer %ptr and tail-calls @func with the loaded
; value. The second, unnamed i32 parameter is not referenced by the body.
; Expected output: after the entry s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0),
; ds_read_b32 is followed by the address computation and s_setpc_b64 with no
; further s_waitcnt in between.
139define void @tail_call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
140; GCN-LABEL: tail_call_memory_arg_load:
141; GCN:       ; %bb.0:
142; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143; GCN-NEXT:    ds_read_b32 v0, v0
144; GCN-NEXT:    s_getpc_b64 s[4:5]
145; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
146; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
147; GCN-NEXT:    s_setpc_b64 s[4:5]
  ; The loaded value is the only argument of the tail call.
148  %vgpr = load volatile i32, ptr addrspace(3) %ptr
149  tail call void @func(i32 %vgpr)
150  ret void
151}
152
; Callees used by the tests in this file. @func and @func.return are declared
; hidden; the expected output reaches them through rel32 relocations.
153declare hidden void @func(i32) #0
154declare hidden i32 @func.return(i32) #0
; @got.func carries no visibility marker; the expected output loads its
; address through a gotpcrel32 relocation before branching to it.
155declare void @got.func(i32) #0
156
; Attribute group #0, attached to every definition and declaration shown
; here: nounwind plus a set of "amdgpu-no-*" string attributes.
157attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
158
; Module flag that sets amdhsa_code_object_version to 500.
159!llvm.module.flags = !{!0}
160!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
161