xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
3; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s
4
5; Test end-to-end codegen for outgoing arguments passed on the
6; stack. This test will likely become redundant once all DAG and
7; GlobalISel tests are unified.
8
; External callees used by the tests below. The first takes two <16 x i32>
; arguments followed by a <4 x i32> argument; the second takes a single
; addrspace(5) pointer marked byval([16 x i32]).
9declare hidden void @external_void_func_v16i32_v16i32_v4i32(<16 x i32>, <16 x i32>, <4 x i32>) #0
10declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32])) #0
11
; Kernel entry point that calls @external_void_func_v16i32_v16i32_v4i32 with
; two undef <16 x i32> arguments and the constant vector <9, 10, 11, 12>.
; In the expected output of both RUN lines the kernel sets s32 to 0 and the
; four constants are written to s32-relative offsets 4, 8, 12 and 16 before
; the s_swappc_b64 that performs the call.
12define amdgpu_kernel void @kernel_caller_stack() {
13; MUBUF-LABEL: kernel_caller_stack:
14; MUBUF:       ; %bb.0:
15; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
16; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
17; MUBUF-NEXT:    s_add_u32 s0, s0, s17
18; MUBUF-NEXT:    s_mov_b32 s32, 0
19; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
20; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
21; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
22; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
23; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
24; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
25; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
26; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
27; MUBUF-NEXT:    s_getpc_b64 s[4:5]
28; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
29; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
30; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
31; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
32; MUBUF-NEXT:    s_endpgm
33;
34; FLATSCR-LABEL: kernel_caller_stack:
35; FLATSCR:       ; %bb.0:
36; FLATSCR-NEXT:    s_mov_b32 s32, 0
37; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
38; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
39; FLATSCR-NEXT:    s_add_u32 s0, s32, 4
40; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
41; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
42; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
43; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
44; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
45; FLATSCR-NEXT:    s_add_u32 s0, s32, 12
46; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
47; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
48; FLATSCR-NEXT:    s_add_u32 s2, s32, 16
49; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
50; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
51; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
52; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
53; FLATSCR-NEXT:    scratch_store_dword off, v0, s2
54; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
55; FLATSCR-NEXT:    s_endpgm
56  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
57  ret void
58}
59
; Kernel entry point that zero-fills a local [16 x i32] alloca with
; llvm.memset and passes it to @external_void_func_byval as a byval argument.
; The expected output reloads the first 64 bytes of the zeroed area and
; stores them at s32-relative offsets 0..60 before the call; s32 is set to
; 0x1400 in the MUBUF checks and to 0x50 in the FLATSCR checks.
; NOTE(review): the memset length is 128 bytes while [16 x i32] is only
; 64 bytes, and the checked zeroing stores cover offsets 0..124 -- confirm
; that writing past the alloca is intentional before regenerating the checks.
60define amdgpu_kernel void @kernel_caller_byval() {
61; MUBUF-LABEL: kernel_caller_byval:
62; MUBUF:       ; %bb.0:
63; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
64; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
65; MUBUF-NEXT:    s_add_u32 s0, s0, s17
66; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
67; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
68; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0
69; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
70; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
71; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
72; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:16
73; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:20
74; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:24
75; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:28
76; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:32
77; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:36
78; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
79; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:44
80; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:48
81; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:52
82; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:56
83; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:60
84; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:64
85; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:68
86; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:72
87; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:76
88; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:80
89; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:84
90; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:88
91; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:92
92; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:96
93; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:100
94; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:104
95; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:108
96; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:112
97; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:116
98; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:120
99; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:124
100; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0
101; MUBUF-NEXT:    s_nop 0
102; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
103; MUBUF-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:8
104; MUBUF-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:12
105; MUBUF-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:16
106; MUBUF-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:20
107; MUBUF-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:24
108; MUBUF-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:28
109; MUBUF-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:32
110; MUBUF-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:36
111; MUBUF-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:40
112; MUBUF-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:44
113; MUBUF-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:48
114; MUBUF-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:52
115; MUBUF-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:56
116; MUBUF-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:60
117; MUBUF-NEXT:    s_movk_i32 s32, 0x1400
118; MUBUF-NEXT:    s_getpc_b64 s[4:5]
119; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
120; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
121; MUBUF-NEXT:    s_waitcnt vmcnt(15)
122; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32
123; MUBUF-NEXT:    s_waitcnt vmcnt(15)
124; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
125; MUBUF-NEXT:    s_waitcnt vmcnt(15)
126; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
127; MUBUF-NEXT:    s_waitcnt vmcnt(15)
128; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
129; MUBUF-NEXT:    s_waitcnt vmcnt(15)
130; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
131; MUBUF-NEXT:    s_waitcnt vmcnt(15)
132; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
133; MUBUF-NEXT:    s_waitcnt vmcnt(15)
134; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:24
135; MUBUF-NEXT:    s_waitcnt vmcnt(15)
136; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28
137; MUBUF-NEXT:    s_waitcnt vmcnt(15)
138; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32
139; MUBUF-NEXT:    s_waitcnt vmcnt(15)
140; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36
141; MUBUF-NEXT:    s_waitcnt vmcnt(15)
142; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40
143; MUBUF-NEXT:    s_waitcnt vmcnt(15)
144; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44
145; MUBUF-NEXT:    s_waitcnt vmcnt(15)
146; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:48
147; MUBUF-NEXT:    s_waitcnt vmcnt(15)
148; MUBUF-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52
149; MUBUF-NEXT:    s_waitcnt vmcnt(15)
150; MUBUF-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56
151; MUBUF-NEXT:    s_waitcnt vmcnt(15)
152; MUBUF-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:60
153; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
154; MUBUF-NEXT:    s_endpgm
155;
156; FLATSCR-LABEL: kernel_caller_byval:
157; FLATSCR:       ; %bb.0:
158; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
159; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
160; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
161; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
162; FLATSCR-NEXT:    s_mov_b32 s0, 0
163; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
164; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:8
165; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
166; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:24
167; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:32
168; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:40
169; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:48
170; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:56
171; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:64
172; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:72
173; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:80
174; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:88
175; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:96
176; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:104
177; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:112
178; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:120
179; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0
180; FLATSCR-NEXT:    s_nop 0
181; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:8
182; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s0 offset:16
183; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s0 offset:24
184; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s0 offset:32
185; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s0 offset:40
186; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s0 offset:48
187; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s0 offset:56
188; FLATSCR-NEXT:    s_movk_i32 s32, 0x50
189; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
190; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
191; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
192; FLATSCR-NEXT:    s_add_u32 s2, s32, 8
193; FLATSCR-NEXT:    s_add_u32 s3, s32, 16
194; FLATSCR-NEXT:    s_add_u32 s4, s32, 24
195; FLATSCR-NEXT:    s_add_u32 s5, s32, 32
196; FLATSCR-NEXT:    s_add_u32 s6, s32, 40
197; FLATSCR-NEXT:    s_add_u32 s7, s32, 48
198; FLATSCR-NEXT:    s_add_u32 s8, s32, 56
199; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
200; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
201; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
202; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
203; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
204; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s3
205; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
206; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[6:7], s4
207; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
208; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[8:9], s5
209; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
210; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[10:11], s6
211; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
212; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[12:13], s7
213; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
214; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[14:15], s8
215; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
216; FLATSCR-NEXT:    s_endpgm
217  %alloca = alloca [16 x i32], align 4, addrspace(5)
218  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 0, i32 128, i1 false)
219  call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %alloca)
220  ret void
221}
222
; Non-kernel function that makes the same call as @kernel_caller_stack.
; Besides the four constant stores at s32-relative offsets 4..16, the
; expected output copies the incoming s33 to a scratch SGPR, sets s33 to s32,
; spills v40 at s33, writes s30, s31 and the saved s33 into lanes 0, 1 and 2
; of v40, and bumps s32 (by 0x400 in the MUBUF checks, by 16 in the FLATSCR
; checks). After the call these are restored before s_setpc_b64.
223define void @func_caller_stack() {
224; MUBUF-LABEL: func_caller_stack:
225; MUBUF:       ; %bb.0:
226; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; MUBUF-NEXT:    s_mov_b32 s4, s33
228; MUBUF-NEXT:    s_mov_b32 s33, s32
229; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
230; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
231; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
232; MUBUF-NEXT:    s_addk_i32 s32, 0x400
233; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
234; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
235; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
236; MUBUF-NEXT:    v_writelane_b32 v40, s4, 2
237; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
238; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
239; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
240; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
241; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
242; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
243; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
244; MUBUF-NEXT:    s_getpc_b64 s[4:5]
245; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
246; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
247; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
248; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
249; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
250; MUBUF-NEXT:    s_mov_b32 s32, s33
251; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
252; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
253; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
254; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
255; MUBUF-NEXT:    s_mov_b32 s33, s4
256; MUBUF-NEXT:    s_waitcnt vmcnt(0)
257; MUBUF-NEXT:    s_setpc_b64 s[30:31]
258;
259; FLATSCR-LABEL: func_caller_stack:
260; FLATSCR:       ; %bb.0:
261; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; FLATSCR-NEXT:    s_mov_b32 s0, s33
263; FLATSCR-NEXT:    s_mov_b32 s33, s32
264; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
265; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
266; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
267; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
268; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 2
269; FLATSCR-NEXT:    s_add_u32 s0, s32, 4
270; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
271; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
272; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
273; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
274; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
275; FLATSCR-NEXT:    s_add_u32 s0, s32, 12
276; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
277; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
278; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
279; FLATSCR-NEXT:    s_add_u32 s0, s32, 16
280; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
281; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
282; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
283; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
284; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
285; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
286; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
287; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
288; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
289; FLATSCR-NEXT:    s_mov_b32 s32, s33
290; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
291; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
292; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
293; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
294; FLATSCR-NEXT:    s_mov_b32 s33, s0
295; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
296; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
297  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
298  ret void
299}
300
; Non-kernel function that forwards its incoming addrspace(5) pointer
; %argptr to @external_void_func_byval as a byval([16 x i32]) argument.
; The expected output copies 64 bytes, addressed through v0, to s32-relative
; offsets 0..60 before the call: dword load/store pairs in the MUBUF checks,
; dwordx2 load/store pairs in the FLATSCR checks. v40 is spilled at s33 and
; its lanes 0, 1 and 2 hold s30, s31 and the saved s33 across the call.
301define void @func_caller_byval(ptr addrspace(5) %argptr) {
302; MUBUF-LABEL: func_caller_byval:
303; MUBUF:       ; %bb.0:
304; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305; MUBUF-NEXT:    s_mov_b32 s4, s33
306; MUBUF-NEXT:    s_mov_b32 s33, s32
307; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
308; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
309; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
310; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
311; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
312; MUBUF-NEXT:    s_addk_i32 s32, 0x400
313; MUBUF-NEXT:    v_writelane_b32 v40, s4, 2
314; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
315; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
316; MUBUF-NEXT:    s_getpc_b64 s[4:5]
317; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
318; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
319; MUBUF-NEXT:    s_waitcnt vmcnt(1)
320; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
321; MUBUF-NEXT:    s_waitcnt vmcnt(1)
322; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
323; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
324; MUBUF-NEXT:    s_nop 0
325; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
326; MUBUF-NEXT:    s_waitcnt vmcnt(1)
327; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
328; MUBUF-NEXT:    s_waitcnt vmcnt(1)
329; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12
330; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
331; MUBUF-NEXT:    s_nop 0
332; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
333; MUBUF-NEXT:    s_waitcnt vmcnt(1)
334; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
335; MUBUF-NEXT:    s_waitcnt vmcnt(1)
336; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
337; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
338; MUBUF-NEXT:    s_nop 0
339; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
340; MUBUF-NEXT:    s_waitcnt vmcnt(1)
341; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24
342; MUBUF-NEXT:    s_waitcnt vmcnt(1)
343; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
344; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
345; MUBUF-NEXT:    s_nop 0
346; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
347; MUBUF-NEXT:    s_waitcnt vmcnt(1)
348; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:32
349; MUBUF-NEXT:    s_waitcnt vmcnt(1)
350; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:36
351; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
352; MUBUF-NEXT:    s_nop 0
353; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
354; MUBUF-NEXT:    s_waitcnt vmcnt(1)
355; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:40
356; MUBUF-NEXT:    s_waitcnt vmcnt(1)
357; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:44
358; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
359; MUBUF-NEXT:    s_nop 0
360; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
361; MUBUF-NEXT:    s_waitcnt vmcnt(1)
362; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:48
363; MUBUF-NEXT:    s_waitcnt vmcnt(1)
364; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:52
365; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
366; MUBUF-NEXT:    s_nop 0
367; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
368; MUBUF-NEXT:    s_waitcnt vmcnt(1)
369; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56
370; MUBUF-NEXT:    s_waitcnt vmcnt(1)
371; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
372; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
373; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
374; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
375; MUBUF-NEXT:    s_mov_b32 s32, s33
376; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
377; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
378; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
379; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
380; MUBUF-NEXT:    s_mov_b32 s33, s4
381; MUBUF-NEXT:    s_waitcnt vmcnt(0)
382; MUBUF-NEXT:    s_setpc_b64 s[30:31]
383;
384; FLATSCR-LABEL: func_caller_byval:
385; FLATSCR:       ; %bb.0:
386; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; FLATSCR-NEXT:    s_mov_b32 s0, s33
388; FLATSCR-NEXT:    s_mov_b32 s33, s32
389; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
390; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
391; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
392; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off
393; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
394; FLATSCR-NEXT:    v_add_u32_e32 v3, 8, v0
395; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 2
396; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
397; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
398; FLATSCR-NEXT:    s_add_u32 s2, s32, 56
399; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
400; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
401; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32
402; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v3, off
403; FLATSCR-NEXT:    v_add_u32_e32 v3, 16, v0
404; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
405; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
406; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v3, off
407; FLATSCR-NEXT:    s_add_u32 s0, s32, 16
408; FLATSCR-NEXT:    v_add_u32_e32 v3, 24, v0
409; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
410; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
411; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v3, off
412; FLATSCR-NEXT:    s_add_u32 s0, s32, 24
413; FLATSCR-NEXT:    v_add_u32_e32 v3, 32, v0
414; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
415; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
416; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v3, off
417; FLATSCR-NEXT:    s_add_u32 s0, s32, 32
418; FLATSCR-NEXT:    v_add_u32_e32 v3, 40, v0
419; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
420; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
421; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v3, off
422; FLATSCR-NEXT:    s_add_u32 s0, s32, 40
423; FLATSCR-NEXT:    v_add_u32_e32 v3, 48, v0
424; FLATSCR-NEXT:    v_add_u32_e32 v0, 56, v0
425; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
426; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
427; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v3, off
428; FLATSCR-NEXT:    s_add_u32 s0, s32, 48
429; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
430; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
431; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
432; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
433; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
434; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
435; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
436; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s2
437; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
438; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
439; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
440; FLATSCR-NEXT:    s_mov_b32 s32, s33
441; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
442; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
443; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
444; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
445; FLATSCR-NEXT:    s_mov_b32 s33, s0
446; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
447; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
448  call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %argptr)
449  ret void
450}
451
; Memset intrinsic (addrspace(5) destination, i32 length) used by
; @kernel_caller_byval to zero-fill its alloca.
452declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #1
453
; Attribute group #0 is attached to the two external callee declarations;
; group #1 is attached to the memset intrinsic declaration.
454attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
455attributes #1 = { argmemonly nofree nounwind willreturn writeonly }
456