xref: /llvm-project/llvm/test/CodeGen/AMDGPU/cc-update.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s
6
; Baseline: a kernel with an empty body. Every target is expected to emit
; nothing but s_endpgm.
7define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
8; GFX803-LABEL: test_kern_empty:
9; GFX803:       ; %bb.0: ; %entry
10; GFX803-NEXT:    s_endpgm
11;
12; GFX900-LABEL: test_kern_empty:
13; GFX900:       ; %bb.0: ; %entry
14; GFX900-NEXT:    s_endpgm
15;
16; GFX1010-LABEL: test_kern_empty:
17; GFX1010:       ; %bb.0: ; %entry
18; GFX1010-NEXT:    s_endpgm
19;
20; GFX1100-LABEL: test_kern_empty:
21; GFX1100:       ; %bb.0: ; %entry
22; GFX1100-NEXT:    s_endpgm
23entry:
24  ret void
25}
26
; Kernel whose only work is a volatile store to a private (addrspace(5))
; alloca. GFX803/GFX900/GFX1010 are expected to add s17 into s[0:1] and store
; with buffer_store_dword through s[0:3]; GFX1100 is expected to use
; scratch_store_b32 with no such setup.
27define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
28; GFX803-LABEL: test_kern_stack:
29; GFX803:       ; %bb.0: ; %entry
30; GFX803-NEXT:    s_add_u32 s0, s0, s17
31; GFX803-NEXT:    s_addc_u32 s1, s1, 0
32; GFX803-NEXT:    v_mov_b32_e32 v0, 0
33; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; GFX803-NEXT:    s_waitcnt vmcnt(0)
35; GFX803-NEXT:    s_endpgm
36;
37; GFX900-LABEL: test_kern_stack:
38; GFX900:       ; %bb.0: ; %entry
39; GFX900-NEXT:    s_add_u32 s0, s0, s17
40; GFX900-NEXT:    s_addc_u32 s1, s1, 0
41; GFX900-NEXT:    v_mov_b32_e32 v0, 0
42; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0
43; GFX900-NEXT:    s_waitcnt vmcnt(0)
44; GFX900-NEXT:    s_endpgm
45;
46; GFX1010-LABEL: test_kern_stack:
47; GFX1010:       ; %bb.0: ; %entry
48; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
49; GFX1010-NEXT:    s_add_u32 s0, s0, s17
50; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
51; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0
52; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
53; GFX1010-NEXT:    s_endpgm
54;
55; GFX1100-LABEL: test_kern_stack:
56; GFX1100:       ; %bb.0: ; %entry
57; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
58; GFX1100-NEXT:    scratch_store_b32 off, v0, off dlc
59; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
60; GFX1100-NEXT:    s_endpgm
61entry:
62  %x = alloca i32, align 4, addrspace(5)
63  store volatile i32 0, ptr addrspace(5) %x, align 4
64  ret void
65}
66
; Kernel that does nothing but call the external function @ex. On every target
; s32 is expected to be set to 0 before the s_swappc_b64 call, and the three
; workitem-id inputs v0/v1/v2 are expected to be packed into v31 on
; GFX803/GFX900/GFX1010 (GFX1100 just copies v0 into v31).
67define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
68; GFX803-LABEL: test_kern_call:
69; GFX803:       ; %bb.0: ; %entry
70; GFX803-NEXT:    s_add_i32 s12, s12, s17
71; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
72; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
73; GFX803-NEXT:    s_add_u32 s0, s0, s17
74; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
75; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
76; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
77; GFX803-NEXT:    s_addc_u32 s1, s1, 0
78; GFX803-NEXT:    s_mov_b32 s13, s15
79; GFX803-NEXT:    s_mov_b32 s12, s14
80; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
81; GFX803-NEXT:    s_mov_b32 s14, s16
82; GFX803-NEXT:    s_mov_b32 s32, 0
83; GFX803-NEXT:    s_getpc_b64 s[18:19]
84; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
85; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
86; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
87; GFX803-NEXT:    s_endpgm
88;
89; GFX900-LABEL: test_kern_call:
90; GFX900:       ; %bb.0: ; %entry
91; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
92; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
93; GFX900-NEXT:    s_add_u32 s0, s0, s17
94; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
95; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
96; GFX900-NEXT:    s_addc_u32 s1, s1, 0
97; GFX900-NEXT:    s_mov_b32 s13, s15
98; GFX900-NEXT:    s_mov_b32 s12, s14
99; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
100; GFX900-NEXT:    s_mov_b32 s14, s16
101; GFX900-NEXT:    s_mov_b32 s32, 0
102; GFX900-NEXT:    s_getpc_b64 s[18:19]
103; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
104; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
105; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
106; GFX900-NEXT:    s_endpgm
107;
108; GFX1010-LABEL: test_kern_call:
109; GFX1010:       ; %bb.0: ; %entry
110; GFX1010-NEXT:    s_add_u32 s12, s12, s17
111; GFX1010-NEXT:    s_mov_b32 s32, 0
112; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
113; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
114; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
115; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
116; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
117; GFX1010-NEXT:    s_add_u32 s0, s0, s17
118; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
119; GFX1010-NEXT:    s_mov_b32 s13, s15
120; GFX1010-NEXT:    s_mov_b32 s12, s14
121; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
122; GFX1010-NEXT:    s_mov_b32 s14, s16
123; GFX1010-NEXT:    s_getpc_b64 s[18:19]
124; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
125; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
126; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
127; GFX1010-NEXT:    s_endpgm
128;
129; GFX1100-LABEL: test_kern_call:
130; GFX1100:       ; %bb.0: ; %entry
131; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
132; GFX1100-NEXT:    s_mov_b32 s12, s13
133; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
134; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
135; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
136; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
137; GFX1100-NEXT:    s_mov_b32 s13, s14
138; GFX1100-NEXT:    s_mov_b32 s14, s15
139; GFX1100-NEXT:    s_mov_b32 s32, 0
140; GFX1100-NEXT:    s_getpc_b64 s[16:17]
141; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
142; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
143; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
144; GFX1100-NEXT:    s_endpgm
145
146entry:
147  tail call void @ex() #0
148  ret void
149}
150
; Kernel that performs a volatile store to a private alloca and then calls
; @ex. Unlike the call-only kernel, s32 is expected to be initialized to a
; non-zero value before the call: 0x400 on GFX803/GFX900, 0x200 on GFX1010 and
; 16 on GFX1100. The private store is expected to be issued (and waited on)
; before the s_swappc_b64.
151define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
152; GFX803-LABEL: test_kern_stack_and_call:
153; GFX803:       ; %bb.0: ; %entry
154; GFX803-NEXT:    s_add_i32 s12, s12, s17
155; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
156; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
157; GFX803-NEXT:    s_add_u32 s0, s0, s17
158; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
159; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
160; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
161; GFX803-NEXT:    s_addc_u32 s1, s1, 0
162; GFX803-NEXT:    s_mov_b32 s13, s15
163; GFX803-NEXT:    s_mov_b32 s12, s14
164; GFX803-NEXT:    v_mov_b32_e32 v3, 0
165; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
166; GFX803-NEXT:    s_mov_b32 s14, s16
167; GFX803-NEXT:    s_movk_i32 s32, 0x400
168; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0
169; GFX803-NEXT:    s_waitcnt vmcnt(0)
170; GFX803-NEXT:    s_getpc_b64 s[18:19]
171; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
172; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
173; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
174; GFX803-NEXT:    s_endpgm
175;
176; GFX900-LABEL: test_kern_stack_and_call:
177; GFX900:       ; %bb.0: ; %entry
178; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
179; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
180; GFX900-NEXT:    s_add_u32 s0, s0, s17
181; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
182; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
183; GFX900-NEXT:    s_addc_u32 s1, s1, 0
184; GFX900-NEXT:    s_mov_b32 s13, s15
185; GFX900-NEXT:    s_mov_b32 s12, s14
186; GFX900-NEXT:    v_mov_b32_e32 v3, 0
187; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
188; GFX900-NEXT:    s_mov_b32 s14, s16
189; GFX900-NEXT:    s_movk_i32 s32, 0x400
190; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0
191; GFX900-NEXT:    s_waitcnt vmcnt(0)
192; GFX900-NEXT:    s_getpc_b64 s[18:19]
193; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
194; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
195; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
196; GFX900-NEXT:    s_endpgm
197;
198; GFX1010-LABEL: test_kern_stack_and_call:
199; GFX1010:       ; %bb.0: ; %entry
200; GFX1010-NEXT:    s_add_u32 s12, s12, s17
201; GFX1010-NEXT:    s_movk_i32 s32, 0x200
202; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
203; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
204; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
205; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
206; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
207; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
208; GFX1010-NEXT:    s_add_u32 s0, s0, s17
209; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
210; GFX1010-NEXT:    s_mov_b32 s13, s15
211; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
212; GFX1010-NEXT:    s_mov_b32 s12, s14
213; GFX1010-NEXT:    s_mov_b32 s14, s16
214; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0
215; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
216; GFX1010-NEXT:    s_getpc_b64 s[18:19]
217; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
218; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
219; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
220; GFX1010-NEXT:    s_endpgm
221;
222; GFX1100-LABEL: test_kern_stack_and_call:
223; GFX1100:       ; %bb.0: ; %entry
224; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
225; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
226; GFX1100-NEXT:    s_mov_b32 s12, s13
227; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
228; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
229; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
230; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
231; GFX1100-NEXT:    s_mov_b32 s13, s14
232; GFX1100-NEXT:    s_mov_b32 s14, s15
233; GFX1100-NEXT:    s_mov_b32 s32, 16
234; GFX1100-NEXT:    scratch_store_b32 off, v1, off dlc
235; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
236; GFX1100-NEXT:    s_getpc_b64 s[16:17]
237; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
238; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
239; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
240; GFX1100-NEXT:    s_endpgm
241
242entry:
243  %x = alloca i32, align 4, addrspace(5)
244  store volatile i32 0, ptr addrspace(5) %x, align 4
245  tail call void @ex() #0
246  ret void
247}
248
; Empty kernel built with attribute group #2 ("frame-pointer"="all"). Compared
; with the plain empty kernel, every target is additionally expected to zero
; s33 before s_endpgm.
249define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
250; GFX803-LABEL: test_force_fp_kern_empty:
251; GFX803:       ; %bb.0: ; %entry
252; GFX803-NEXT:    s_mov_b32 s33, 0
253; GFX803-NEXT:    s_endpgm
254;
255; GFX900-LABEL: test_force_fp_kern_empty:
256; GFX900:       ; %bb.0: ; %entry
257; GFX900-NEXT:    s_mov_b32 s33, 0
258; GFX900-NEXT:    s_endpgm
259;
260; GFX1010-LABEL: test_force_fp_kern_empty:
261; GFX1010:       ; %bb.0: ; %entry
262; GFX1010-NEXT:    s_mov_b32 s33, 0
263; GFX1010-NEXT:    s_endpgm
264;
265; GFX1100-LABEL: test_force_fp_kern_empty:
266; GFX1100:       ; %bb.0: ; %entry
267; GFX1100-NEXT:    s_mov_b32 s33, 0
268; GFX1100-NEXT:    s_endpgm
269
270entry:
271  ret void
272}
273
; Private-store kernel built with attribute group #2 ("frame-pointer"="all").
; Every target is expected to zero s33 and to use s33, rather than an
; immediate 0 / "off", as the offset register of the store to the alloca.
274define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
275; GFX803-LABEL: test_force_fp_kern_stack:
276; GFX803:       ; %bb.0: ; %entry
277; GFX803-NEXT:    s_add_u32 s0, s0, s17
278; GFX803-NEXT:    s_mov_b32 s33, 0
279; GFX803-NEXT:    s_addc_u32 s1, s1, 0
280; GFX803-NEXT:    v_mov_b32_e32 v0, 0
281; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33
282; GFX803-NEXT:    s_waitcnt vmcnt(0)
283; GFX803-NEXT:    s_endpgm
284;
285; GFX900-LABEL: test_force_fp_kern_stack:
286; GFX900:       ; %bb.0: ; %entry
287; GFX900-NEXT:    s_add_u32 s0, s0, s17
288; GFX900-NEXT:    s_mov_b32 s33, 0
289; GFX900-NEXT:    s_addc_u32 s1, s1, 0
290; GFX900-NEXT:    v_mov_b32_e32 v0, 0
291; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33
292; GFX900-NEXT:    s_waitcnt vmcnt(0)
293; GFX900-NEXT:    s_endpgm
294;
295; GFX1010-LABEL: test_force_fp_kern_stack:
296; GFX1010:       ; %bb.0: ; %entry
297; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
298; GFX1010-NEXT:    s_add_u32 s0, s0, s17
299; GFX1010-NEXT:    s_mov_b32 s33, 0
300; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
301; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33
302; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
303; GFX1010-NEXT:    s_endpgm
304;
305; GFX1100-LABEL: test_force_fp_kern_stack:
306; GFX1100:       ; %bb.0: ; %entry
307; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
308; GFX1100-NEXT:    s_mov_b32 s33, 0
309; GFX1100-NEXT:    scratch_store_b32 off, v0, s33 dlc
310; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
311; GFX1100-NEXT:    s_endpgm
312entry:
313  %x = alloca i32, align 4, addrspace(5)
314  store volatile i32 0, ptr addrspace(5) %x, align 4
315  ret void
316}
317
; Call-only kernel built with attribute group #2 ("frame-pointer"="all").
; Every target is expected to zero both s33 and s32 before the s_swappc_b64
; call to @ex.
318define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
319; GFX803-LABEL: test_force_fp_kern_call:
320; GFX803:       ; %bb.0: ; %entry
321; GFX803-NEXT:    s_add_i32 s12, s12, s17
322; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
323; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
324; GFX803-NEXT:    s_add_u32 s0, s0, s17
325; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
326; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
327; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
328; GFX803-NEXT:    s_addc_u32 s1, s1, 0
329; GFX803-NEXT:    s_mov_b32 s13, s15
330; GFX803-NEXT:    s_mov_b32 s12, s14
331; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
332; GFX803-NEXT:    s_mov_b32 s14, s16
333; GFX803-NEXT:    s_mov_b32 s33, 0
334; GFX803-NEXT:    s_mov_b32 s32, 0
335; GFX803-NEXT:    s_getpc_b64 s[18:19]
336; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
337; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
338; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
339; GFX803-NEXT:    s_endpgm
340;
341; GFX900-LABEL: test_force_fp_kern_call:
342; GFX900:       ; %bb.0: ; %entry
343; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
344; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
345; GFX900-NEXT:    s_add_u32 s0, s0, s17
346; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
347; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
348; GFX900-NEXT:    s_addc_u32 s1, s1, 0
349; GFX900-NEXT:    s_mov_b32 s13, s15
350; GFX900-NEXT:    s_mov_b32 s12, s14
351; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
352; GFX900-NEXT:    s_mov_b32 s14, s16
353; GFX900-NEXT:    s_mov_b32 s33, 0
354; GFX900-NEXT:    s_mov_b32 s32, 0
355; GFX900-NEXT:    s_getpc_b64 s[18:19]
356; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
357; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
358; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
359; GFX900-NEXT:    s_endpgm
360;
361; GFX1010-LABEL: test_force_fp_kern_call:
362; GFX1010:       ; %bb.0: ; %entry
363; GFX1010-NEXT:    s_add_u32 s12, s12, s17
364; GFX1010-NEXT:    s_mov_b32 s33, 0
365; GFX1010-NEXT:    s_mov_b32 s32, 0
366; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
367; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
368; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
369; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
370; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
371; GFX1010-NEXT:    s_add_u32 s0, s0, s17
372; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
373; GFX1010-NEXT:    s_mov_b32 s13, s15
374; GFX1010-NEXT:    s_mov_b32 s12, s14
375; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
376; GFX1010-NEXT:    s_mov_b32 s14, s16
377; GFX1010-NEXT:    s_getpc_b64 s[18:19]
378; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
379; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
380; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
381; GFX1010-NEXT:    s_endpgm
382;
383; GFX1100-LABEL: test_force_fp_kern_call:
384; GFX1100:       ; %bb.0: ; %entry
385; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
386; GFX1100-NEXT:    s_mov_b32 s12, s13
387; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
388; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
389; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
390; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
391; GFX1100-NEXT:    s_mov_b32 s13, s14
392; GFX1100-NEXT:    s_mov_b32 s14, s15
393; GFX1100-NEXT:    s_mov_b32 s33, 0
394; GFX1100-NEXT:    s_mov_b32 s32, 0
395; GFX1100-NEXT:    s_getpc_b64 s[16:17]
396; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
397; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
398; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
399; GFX1100-NEXT:    s_endpgm
419entry:
420  tail call void @ex() #2
421  ret void
422}
423
; Stack-plus-call kernel built with attribute group #2 ("frame-pointer"="all").
; Every target is expected to zero s33 and use it as the offset register of
; the private store, and to set s32 to a non-zero value before calling @ex:
; 0x400 on GFX803/GFX900, 0x200 on GFX1010 and 16 on GFX1100.
424define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
425; GFX803-LABEL: test_force_fp_kern_stack_and_call:
426; GFX803:       ; %bb.0: ; %entry
427; GFX803-NEXT:    s_add_i32 s12, s12, s17
428; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
429; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
430; GFX803-NEXT:    s_add_u32 s0, s0, s17
431; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
432; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
433; GFX803-NEXT:    s_mov_b32 s33, 0
434; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
435; GFX803-NEXT:    s_addc_u32 s1, s1, 0
436; GFX803-NEXT:    s_mov_b32 s13, s15
437; GFX803-NEXT:    s_mov_b32 s12, s14
438; GFX803-NEXT:    v_mov_b32_e32 v3, 0
439; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
440; GFX803-NEXT:    s_mov_b32 s14, s16
441; GFX803-NEXT:    s_movk_i32 s32, 0x400
442; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33
443; GFX803-NEXT:    s_waitcnt vmcnt(0)
444; GFX803-NEXT:    s_getpc_b64 s[18:19]
445; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
446; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
447; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
448; GFX803-NEXT:    s_endpgm
449;
450; GFX900-LABEL: test_force_fp_kern_stack_and_call:
451; GFX900:       ; %bb.0: ; %entry
452; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
453; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
454; GFX900-NEXT:    s_add_u32 s0, s0, s17
455; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
456; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
457; GFX900-NEXT:    s_mov_b32 s33, 0
458; GFX900-NEXT:    s_addc_u32 s1, s1, 0
459; GFX900-NEXT:    s_mov_b32 s13, s15
460; GFX900-NEXT:    s_mov_b32 s12, s14
461; GFX900-NEXT:    v_mov_b32_e32 v3, 0
462; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
463; GFX900-NEXT:    s_mov_b32 s14, s16
464; GFX900-NEXT:    s_movk_i32 s32, 0x400
465; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33
466; GFX900-NEXT:    s_waitcnt vmcnt(0)
467; GFX900-NEXT:    s_getpc_b64 s[18:19]
468; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
469; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
470; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
471; GFX900-NEXT:    s_endpgm
472;
473; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
474; GFX1010:       ; %bb.0: ; %entry
475; GFX1010-NEXT:    s_add_u32 s12, s12, s17
476; GFX1010-NEXT:    s_mov_b32 s33, 0
477; GFX1010-NEXT:    s_movk_i32 s32, 0x200
478; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
479; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
480; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
481; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
482; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
483; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
484; GFX1010-NEXT:    s_add_u32 s0, s0, s17
485; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
486; GFX1010-NEXT:    s_mov_b32 s13, s15
487; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
488; GFX1010-NEXT:    s_mov_b32 s12, s14
489; GFX1010-NEXT:    s_mov_b32 s14, s16
490; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33
491; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
492; GFX1010-NEXT:    s_getpc_b64 s[18:19]
493; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
494; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
495; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
496; GFX1010-NEXT:    s_endpgm
497;
498; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
499; GFX1100:       ; %bb.0: ; %entry
500; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
501; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
502; GFX1100-NEXT:    s_mov_b32 s33, 0
503; GFX1100-NEXT:    s_mov_b32 s12, s13
504; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
505; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
506; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
507; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
508; GFX1100-NEXT:    s_mov_b32 s13, s14
509; GFX1100-NEXT:    s_mov_b32 s14, s15
510; GFX1100-NEXT:    s_mov_b32 s32, 16
511; GFX1100-NEXT:    scratch_store_b32 off, v1, s33 dlc
512; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
513; GFX1100-NEXT:    s_getpc_b64 s[16:17]
514; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
515; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
516; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
517; GFX1100-NEXT:    s_endpgm
518entry:
519  %x = alloca i32, align 4, addrspace(5)
520  store volatile i32 0, ptr addrspace(5) %x, align 4
521  tail call void @ex() #2
522  ret void
523}
524
; Kernel limited to 8 VGPRs (attribute group #1) that loads a value from a
; large private alloca, clobbers v0-v7 with inline asm so the value must be
; spilled, and stores it back. The spill/reload is expected to address its
; slot through an SGPR offset operand rather than an immediate offset.
525define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
526; GFX803-LABEL: test_sgpr_offset_kernel:
527; GFX803:       ; %bb.0: ; %entry
528; GFX803-NEXT:    s_add_u32 s0, s0, s17
529; GFX803-NEXT:    s_addc_u32 s1, s1, 0
530; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
531; GFX803-NEXT:    s_waitcnt vmcnt(0)
532; GFX803-NEXT:    s_mov_b32 s4, 0x40000
533; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
534; GFX803-NEXT:    ;;#ASMSTART
535; GFX803-NEXT:    ;;#ASMEND
536; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
537; GFX803-NEXT:    s_waitcnt vmcnt(0)
538; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
539; GFX803-NEXT:    s_waitcnt vmcnt(0)
540; GFX803-NEXT:    s_endpgm
541;
542; GFX900-LABEL: test_sgpr_offset_kernel:
543; GFX900:       ; %bb.0: ; %entry
544; GFX900-NEXT:    s_add_u32 s0, s0, s17
545; GFX900-NEXT:    s_addc_u32 s1, s1, 0
546; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
547; GFX900-NEXT:    s_waitcnt vmcnt(0)
548; GFX900-NEXT:    s_mov_b32 s4, 0x40000
549; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
550; GFX900-NEXT:    ;;#ASMSTART
551; GFX900-NEXT:    ;;#ASMEND
552; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
553; GFX900-NEXT:    s_waitcnt vmcnt(0)
554; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
555; GFX900-NEXT:    s_waitcnt vmcnt(0)
556; GFX900-NEXT:    s_endpgm
557;
558; GFX1010-LABEL: test_sgpr_offset_kernel:
559; GFX1010:       ; %bb.0: ; %entry
560; GFX1010-NEXT:    s_add_u32 s0, s0, s17
561; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
562; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
563; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
564; GFX1010-NEXT:    s_waitcnt vmcnt(0)
565; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
566; GFX1010-NEXT:    ;;#ASMSTART
567; GFX1010-NEXT:    ;;#ASMEND
568; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
569; GFX1010-NEXT:    s_waitcnt vmcnt(0)
570; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
571; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
572; GFX1010-NEXT:    s_endpgm
573;
574; GFX1100-LABEL: test_sgpr_offset_kernel:
575; GFX1100:       ; %bb.0: ; %entry
576; GFX1100-NEXT:    scratch_load_b32 v0, off, off offset:8 glc dlc
577; GFX1100-NEXT:    s_waitcnt vmcnt(0)
578; GFX1100-NEXT:    s_movk_i32 s0, 0x1000
579; GFX1100-NEXT:    scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
580; GFX1100-NEXT:    ;;#ASMSTART
581; GFX1100-NEXT:    ;;#ASMEND
582; GFX1100-NEXT:    scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
583; GFX1100-NEXT:    s_waitcnt vmcnt(0)
584; GFX1100-NEXT:    scratch_store_b32 off, v0, off offset:8 dlc
585; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
586; GFX1100-NEXT:    s_endpgm
587entry:
588  ; Occupy 4092 bytes of scratch with this alloca, so the offset of the spill
589  ; of %a does not fit in the instruction and has to live in the SGPR offset.
590  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
591
592  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
593  ; The generated checks expect the spill slot at byte offset 4096:
594  ; 0x40000 / 64 = 4096 on GFX803/GFX900 (wave64), 0x20000 / 32 = 4096 on
595  ; GFX1010 (presumably wave32), and an unscaled 0x1000 on GFX1100.
596  %a = load volatile i32, ptr addrspace(5) %aptr
597
598  ; Force %a to spill
599  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
600
601  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
602  store volatile i32 %a, ptr addrspace(5) %outptr
603
604  ret void
605}
606
607declare hidden void @ex() local_unnamed_addr #0
608
609attributes #0 = { nounwind }
610attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
611attributes #2 = { nounwind "frame-pointer"="all" }
612
613!llvm.module.flags = !{!0}
614!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
615