; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck %s

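; Exercises frame pointer setup and restore around calls when the
; return address and FP-related SGPRs are spilled to VGPR lanes.
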
; FP is in the CSR range and is modified.
define hidden fastcc void @callee_has_fp() #1 {
; CHECK-LABEL: callee_has_fp:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_add_i32 s32, s32, 0x200
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s33
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca i32, addrspace(5)
  store volatile i32 1, ptr addrspace(5) %alloca
  ret void
}

; Has no stack objects initially, but introduces them due to the CSR
; spill. With IPRA we see the FP modified in the callee. We should not
; emit redundant spills of s33 or hit an assertion.
define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s18, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
; CHECK-NEXT:    s_add_i32 s32, s32, 0x400
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ; clobber csr v40
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
; CHECK-NEXT:    s_mov_b32 s33, s18
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
bb:
  call fastcc void @callee_has_fp()
  call void asm sideeffect "; clobber csr v40", "~{v40}"()
  ret void
}

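; Kernel entry point that reaches the CSR VGPR spill case above through
; a call.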
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT:    s_add_u32 s0, s0, s17
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
; CHECK-NEXT:    s_mov_b32 s13, s15
; CHECK-NEXT:    s_mov_b32 s12, s14
; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, csr_vgpr_spill_fp_callee@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, csr_vgpr_spill_fp_callee@rel32@hi+12
; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT:    s_mov_b32 s15, 20
; CHECK-NEXT:    v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT:    s_mov_b32 s15, 10
; CHECK-NEXT:    v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT:    ; implicit-def: $sgpr15
; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    s_endpgm
bb:
  tail call fastcc void @csr_vgpr_spill_fp_callee()
  ret void
}

; Same, except with a tail call.
define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v1, s33, 0
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ; clobber csr v40
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT:    v_readlane_b32 s33, v1, 0
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT:    s_xor_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    s_setpc_b64 s[16:17]
bb:
  call void asm sideeffect "; clobber csr v40", "~{v40}"()
  tail call fastcc void @callee_has_fp()
  ret void
}

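; Kernel entry point for the tail call variant above.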
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT:    s_add_u32 s0, s0, s17
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
; CHECK-NEXT:    s_mov_b32 s13, s15
; CHECK-NEXT:    s_mov_b32 s12, s14
; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT:    s_mov_b32 s15, 20
; CHECK-NEXT:    v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT:    s_mov_b32 s15, 10
; CHECK-NEXT:    v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT:    ; implicit-def: $sgpr15
; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    s_endpgm
bb:
  tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
  ret void
}

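; Trivial callee built with "frame-pointer"="all"; it sets up s33 and
; restores it before returning.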
define hidden i32 @tail_call() #1 {
; CHECK-LABEL: tail_call:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ret i32 0
}

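; The return address in s[30:31] is spilled to lanes of v1, and v1
; itself is saved and reloaded with exec forced to -1 around the frame.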
define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s18, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
; CHECK-NEXT:    s_add_i32 s32, s32, 0x400
; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, tail_call@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, tail_call@rel32@hi+12
; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
; CHECK-NEXT:    s_mov_b32 s33, s18
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  %call = call i32 @tail_call()
  ret i32 %call
}

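; Same pattern one level further up the call chain.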
define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s19, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
; CHECK-NEXT:    s_add_i32 s32, s32, 0x400
; CHECK-NEXT:    v_writelane_b32 v2, s30, 0
; CHECK-NEXT:    v_writelane_b32 v2, s31, 1
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12
; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
; CHECK-NEXT:    s_mov_b32 s33, s19
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  %call = call i32 @caller_save_vgpr_spill_fp_tail_call()
  ret i32 %call
}

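; Kernel entry point that drives the whole call chain.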
define protected amdgpu_kernel void @kernel() {
; CHECK-LABEL: kernel:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT:    s_add_u32 s0, s0, s17
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
; CHECK-NEXT:    s_mov_b32 s13, s15
; CHECK-NEXT:    s_mov_b32 s12, s14
; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, caller_save_vgpr_spill_fp@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, caller_save_vgpr_spill_fp@rel32@hi+12
; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT:    s_mov_b32 s15, 20
; CHECK-NEXT:    v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT:    s_mov_b32 s15, 10
; CHECK-NEXT:    v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT:    ; implicit-def: $sgpr15
; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    s_endpgm
entry:
  %call = call i32 @caller_save_vgpr_spill_fp()
  ret void
}

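; #0 disables the frame pointer and #1 forces one; noinline keeps the
; calls in place.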
attributes #0 = { "frame-pointer"="none" noinline }
attributes #1 = { "frame-pointer"="all" noinline }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
