xref: /llvm-project/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
4
; External callees used by the tests in this file. Each takes its parameter(s)
; marked `inreg`; the tests below forward their own inreg arguments to them.
; Attribute groups #0 and #1 are referenced here but defined elsewhere in the file.
5declare hidden void @external_void_func_i8_inreg(i8 inreg) #0
; The i16 callee takes an i16 parameter so that the declaration agrees with its
; name and with the `i16 inreg` call site in test_call_external_void_func_i16_inreg.
6declare hidden void @external_void_func_i16_inreg(i16 inreg) #0
7declare hidden void @external_void_func_i32_inreg(i32 inreg) #0
8declare hidden void @external_void_func_i64_inreg(i64 inreg) #0
9declare hidden void @external_void_func_v2i32_inreg(<2 x i32> inreg) #0
10declare hidden void @external_void_func_v3i32_inreg(<3 x i32> inreg) #0
11declare hidden void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0
12declare hidden void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0
13declare hidden void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0
14declare hidden void @external_void_func_f16_inreg(half inreg) #0
15declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0
16declare hidden void @external_void_func_f32_inreg(float inreg) #0
17declare hidden void @external_void_func_f64_inreg(double inreg) #0
18declare hidden void @external_void_func_v2f16_inreg(<2 x half> inreg) #0
19declare hidden void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0
20declare hidden void @external_void_func_v3f16_inreg(<3 x half> inreg) #0
21declare hidden void @external_void_func_v4f16_inreg(<4 x half> inreg) #0
22
; Pointer and vector-of-pointer callees.
23declare hidden void @external_void_func_p0_inreg(ptr inreg) #0
24declare hidden void @external_void_func_p1_inreg(ptr addrspace(1) inreg) #0
25declare hidden void @external_void_func_p3_inreg(ptr addrspace(3) inreg) #0
26declare hidden void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg) #0
27declare hidden void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg) #0
28
; Callee taking several inreg parameters of mixed width.
29declare hidden void @external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg, i32 inreg, i64 inreg) #0
30
; NOTE(review): these two are named "a15i32" but take a [13 x i32] array; the
; callers are not visible in this part of the file, so confirm whether the
; name or the element count is the intended one before changing either.
31declare hidden void @external_void_func_a15i32_inreg([13 x i32] inreg) #0
32declare hidden void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([13 x i32] inreg, i32 inreg) #1
33
; Forwards an i8 inreg argument unchanged to @external_void_func_i8_inreg and
; returns void.
; The gfx900 checks expect the value to be copied with `s_mov_b32 s0, s16`
; before s_swappc_b64; the gfx1100 checks contain no such copy.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
34define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
35; GFX9-LABEL: test_call_external_void_func_i8_inreg:
36; GFX9:       ; %bb.0:
37; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX9-NEXT:    s_mov_b32 s17, s33
39; GFX9-NEXT:    s_mov_b32 s33, s32
40; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
41; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
42; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
43; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
44; GFX9-NEXT:    s_addk_i32 s32, 0x400
45; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
46; GFX9-NEXT:    s_mov_b32 s0, s16
47; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
48; GFX9-NEXT:    s_getpc_b64 s[18:19]
49; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4
50; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
51; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
52; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
53; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
54; GFX9-NEXT:    s_mov_b32 s32, s33
55; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
56; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
57; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
58; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
59; GFX9-NEXT:    s_mov_b32 s33, s4
60; GFX9-NEXT:    s_waitcnt vmcnt(0)
61; GFX9-NEXT:    s_setpc_b64 s[30:31]
62;
63; GFX11-LABEL: test_call_external_void_func_i8_inreg:
64; GFX11:       ; %bb.0:
65; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66; GFX11-NEXT:    s_mov_b32 s1, s33
67; GFX11-NEXT:    s_mov_b32 s33, s32
68; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
69; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
70; GFX11-NEXT:    s_mov_b32 exec_lo, s2
71; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
72; GFX11-NEXT:    s_add_i32 s32, s32, 16
73; GFX11-NEXT:    s_getpc_b64 s[2:3]
74; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_inreg@rel32@lo+4
75; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_inreg@rel32@hi+12
76; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
77; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
78; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
79; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
80; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
81; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
82; GFX11-NEXT:    s_mov_b32 s32, s33
83; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
84; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
85; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
86; GFX11-NEXT:    s_mov_b32 exec_lo, s1
87; GFX11-NEXT:    s_mov_b32 s33, s0
88; GFX11-NEXT:    s_waitcnt vmcnt(0)
89; GFX11-NEXT:    s_setpc_b64 s[30:31]
90  call void @external_void_func_i8_inreg(i8 inreg %arg)
91  ret void
92}
93
; Forwards an i16 inreg argument unchanged to @external_void_func_i16_inreg
; and returns void.
; The gfx900 checks expect the value to be copied with `s_mov_b32 s0, s16`
; before s_swappc_b64; the gfx1100 checks contain no such copy.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
94define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
95; GFX9-LABEL: test_call_external_void_func_i16_inreg:
96; GFX9:       ; %bb.0:
97; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX9-NEXT:    s_mov_b32 s17, s33
99; GFX9-NEXT:    s_mov_b32 s33, s32
100; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
101; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
102; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
103; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
104; GFX9-NEXT:    s_addk_i32 s32, 0x400
105; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
106; GFX9-NEXT:    s_mov_b32 s0, s16
107; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
108; GFX9-NEXT:    s_getpc_b64 s[18:19]
109; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4
110; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
111; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
112; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
113; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
114; GFX9-NEXT:    s_mov_b32 s32, s33
115; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
116; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
117; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
118; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
119; GFX9-NEXT:    s_mov_b32 s33, s4
120; GFX9-NEXT:    s_waitcnt vmcnt(0)
121; GFX9-NEXT:    s_setpc_b64 s[30:31]
122;
123; GFX11-LABEL: test_call_external_void_func_i16_inreg:
124; GFX11:       ; %bb.0:
125; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX11-NEXT:    s_mov_b32 s1, s33
127; GFX11-NEXT:    s_mov_b32 s33, s32
128; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
129; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
130; GFX11-NEXT:    s_mov_b32 exec_lo, s2
131; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
132; GFX11-NEXT:    s_add_i32 s32, s32, 16
133; GFX11-NEXT:    s_getpc_b64 s[2:3]
134; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_inreg@rel32@lo+4
135; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_inreg@rel32@hi+12
136; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
137; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
138; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
139; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
140; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
141; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
142; GFX11-NEXT:    s_mov_b32 s32, s33
143; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
144; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
145; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
146; GFX11-NEXT:    s_mov_b32 exec_lo, s1
147; GFX11-NEXT:    s_mov_b32 s33, s0
148; GFX11-NEXT:    s_waitcnt vmcnt(0)
149; GFX11-NEXT:    s_setpc_b64 s[30:31]
150  call void @external_void_func_i16_inreg(i16 inreg %arg)
151  ret void
152}
153
; Forwards an i32 inreg argument unchanged to @external_void_func_i32_inreg
; and returns void.
; The gfx900 checks expect the value to be copied with `s_mov_b32 s0, s16`
; before s_swappc_b64; the gfx1100 checks contain no such copy.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
154define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
155; GFX9-LABEL: test_call_external_void_func_i32_inreg:
156; GFX9:       ; %bb.0:
157; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX9-NEXT:    s_mov_b32 s17, s33
159; GFX9-NEXT:    s_mov_b32 s33, s32
160; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
161; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
162; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
163; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
164; GFX9-NEXT:    s_addk_i32 s32, 0x400
165; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
166; GFX9-NEXT:    s_mov_b32 s0, s16
167; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
168; GFX9-NEXT:    s_getpc_b64 s[18:19]
169; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4
170; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
171; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
172; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
173; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
174; GFX9-NEXT:    s_mov_b32 s32, s33
175; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
176; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
177; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
178; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
179; GFX9-NEXT:    s_mov_b32 s33, s4
180; GFX9-NEXT:    s_waitcnt vmcnt(0)
181; GFX9-NEXT:    s_setpc_b64 s[30:31]
182;
183; GFX11-LABEL: test_call_external_void_func_i32_inreg:
184; GFX11:       ; %bb.0:
185; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GFX11-NEXT:    s_mov_b32 s1, s33
187; GFX11-NEXT:    s_mov_b32 s33, s32
188; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
189; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
190; GFX11-NEXT:    s_mov_b32 exec_lo, s2
191; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
192; GFX11-NEXT:    s_add_i32 s32, s32, 16
193; GFX11-NEXT:    s_getpc_b64 s[2:3]
194; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32_inreg@rel32@lo+4
195; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32_inreg@rel32@hi+12
196; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
197; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
198; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
199; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
200; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
201; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
202; GFX11-NEXT:    s_mov_b32 s32, s33
203; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
204; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
205; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
206; GFX11-NEXT:    s_mov_b32 exec_lo, s1
207; GFX11-NEXT:    s_mov_b32 s33, s0
208; GFX11-NEXT:    s_waitcnt vmcnt(0)
209; GFX11-NEXT:    s_setpc_b64 s[30:31]
210  call void @external_void_func_i32_inreg(i32 inreg %arg)
211  ret void
212}
213
; Forwards an i64 inreg argument unchanged to @external_void_func_i64_inreg
; and returns void.
; The gfx900 checks expect two 32-bit copies before s_swappc_b64
; (`s_mov_b32 s1, s17` and `s_mov_b32 s0, s16`); the gfx1100 checks contain
; no such copies.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
214define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
215; GFX9-LABEL: test_call_external_void_func_i64_inreg:
216; GFX9:       ; %bb.0:
217; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX9-NEXT:    s_mov_b32 s18, s33
219; GFX9-NEXT:    s_mov_b32 s33, s32
220; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
221; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
222; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
223; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
224; GFX9-NEXT:    s_addk_i32 s32, 0x400
225; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
226; GFX9-NEXT:    s_mov_b32 s1, s17
227; GFX9-NEXT:    s_mov_b32 s0, s16
228; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
229; GFX9-NEXT:    s_getpc_b64 s[18:19]
230; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
231; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
232; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
233; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
234; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
235; GFX9-NEXT:    s_mov_b32 s32, s33
236; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
237; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
238; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
239; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
240; GFX9-NEXT:    s_mov_b32 s33, s4
241; GFX9-NEXT:    s_waitcnt vmcnt(0)
242; GFX9-NEXT:    s_setpc_b64 s[30:31]
243;
244; GFX11-LABEL: test_call_external_void_func_i64_inreg:
245; GFX11:       ; %bb.0:
246; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX11-NEXT:    s_mov_b32 s2, s33
248; GFX11-NEXT:    s_mov_b32 s33, s32
249; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
250; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
251; GFX11-NEXT:    s_mov_b32 exec_lo, s3
252; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
253; GFX11-NEXT:    s_add_i32 s32, s32, 16
254; GFX11-NEXT:    s_getpc_b64 s[2:3]
255; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64_inreg@rel32@lo+4
256; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64_inreg@rel32@hi+12
257; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
258; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
259; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
260; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
261; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
262; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
263; GFX11-NEXT:    s_mov_b32 s32, s33
264; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
265; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
266; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
267; GFX11-NEXT:    s_mov_b32 exec_lo, s1
268; GFX11-NEXT:    s_mov_b32 s33, s0
269; GFX11-NEXT:    s_waitcnt vmcnt(0)
270; GFX11-NEXT:    s_setpc_b64 s[30:31]
271  call void @external_void_func_i64_inreg(i64 inreg %arg)
272  ret void
273}
274
; Forwards a <2 x i32> inreg argument unchanged to
; @external_void_func_v2i32_inreg and returns void.
; The gfx900 checks expect two 32-bit copies before s_swappc_b64
; (`s_mov_b32 s1, s17` and `s_mov_b32 s0, s16`); the gfx1100 checks contain
; no such copies.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
275define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
276; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
277; GFX9:       ; %bb.0:
278; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279; GFX9-NEXT:    s_mov_b32 s18, s33
280; GFX9-NEXT:    s_mov_b32 s33, s32
281; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
282; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
283; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
284; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
285; GFX9-NEXT:    s_addk_i32 s32, 0x400
286; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
287; GFX9-NEXT:    s_mov_b32 s1, s17
288; GFX9-NEXT:    s_mov_b32 s0, s16
289; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
290; GFX9-NEXT:    s_getpc_b64 s[18:19]
291; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
292; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
293; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
294; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
295; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
296; GFX9-NEXT:    s_mov_b32 s32, s33
297; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
298; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
299; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
300; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
301; GFX9-NEXT:    s_mov_b32 s33, s4
302; GFX9-NEXT:    s_waitcnt vmcnt(0)
303; GFX9-NEXT:    s_setpc_b64 s[30:31]
304;
305; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
306; GFX11:       ; %bb.0:
307; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308; GFX11-NEXT:    s_mov_b32 s2, s33
309; GFX11-NEXT:    s_mov_b32 s33, s32
310; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
311; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
312; GFX11-NEXT:    s_mov_b32 exec_lo, s3
313; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
314; GFX11-NEXT:    s_add_i32 s32, s32, 16
315; GFX11-NEXT:    s_getpc_b64 s[2:3]
316; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32_inreg@rel32@lo+4
317; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32_inreg@rel32@hi+12
318; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
319; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
320; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
321; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
322; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
323; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
324; GFX11-NEXT:    s_mov_b32 s32, s33
325; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
326; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
327; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
328; GFX11-NEXT:    s_mov_b32 exec_lo, s1
329; GFX11-NEXT:    s_mov_b32 s33, s0
330; GFX11-NEXT:    s_waitcnt vmcnt(0)
331; GFX11-NEXT:    s_setpc_b64 s[30:31]
332  call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
333  ret void
334}
335
; Forwards a <3 x i32> inreg argument unchanged to
; @external_void_func_v3i32_inreg and returns void.
; The gfx900 checks expect three 32-bit copies before s_swappc_b64
; (s18 -> s2, s17 -> s1, s16 -> s0); the gfx1100 checks contain no such copies.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
336define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
337; GFX9-LABEL: test_call_external_void_func_v3i32_inreg:
338; GFX9:       ; %bb.0:
339; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340; GFX9-NEXT:    s_mov_b32 s19, s33
341; GFX9-NEXT:    s_mov_b32 s33, s32
342; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
343; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
344; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
345; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
346; GFX9-NEXT:    s_addk_i32 s32, 0x400
347; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
348; GFX9-NEXT:    s_mov_b32 s2, s18
349; GFX9-NEXT:    s_mov_b32 s1, s17
350; GFX9-NEXT:    s_mov_b32 s0, s16
351; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
352; GFX9-NEXT:    s_getpc_b64 s[20:21]
353; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
354; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
355; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
356; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
357; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
358; GFX9-NEXT:    s_mov_b32 s32, s33
359; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
360; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
361; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
362; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
363; GFX9-NEXT:    s_mov_b32 s33, s4
364; GFX9-NEXT:    s_waitcnt vmcnt(0)
365; GFX9-NEXT:    s_setpc_b64 s[30:31]
366;
367; GFX11-LABEL: test_call_external_void_func_v3i32_inreg:
368; GFX11:       ; %bb.0:
369; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370; GFX11-NEXT:    s_mov_b32 s3, s33
371; GFX11-NEXT:    s_mov_b32 s33, s32
372; GFX11-NEXT:    s_or_saveexec_b32 s16, -1
373; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
374; GFX11-NEXT:    s_mov_b32 exec_lo, s16
375; GFX11-NEXT:    v_writelane_b32 v40, s3, 2
376; GFX11-NEXT:    s_add_i32 s32, s32, 16
377; GFX11-NEXT:    s_getpc_b64 s[16:17]
378; GFX11-NEXT:    s_add_u32 s16, s16, external_void_func_v3i32_inreg@rel32@lo+4
379; GFX11-NEXT:    s_addc_u32 s17, s17, external_void_func_v3i32_inreg@rel32@hi+12
380; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
381; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
382; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
383; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
384; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
385; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
386; GFX11-NEXT:    s_mov_b32 s32, s33
387; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
388; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
389; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
390; GFX11-NEXT:    s_mov_b32 exec_lo, s1
391; GFX11-NEXT:    s_mov_b32 s33, s0
392; GFX11-NEXT:    s_waitcnt vmcnt(0)
393; GFX11-NEXT:    s_setpc_b64 s[30:31]
394  call void @external_void_func_v3i32_inreg(<3 x i32> inreg %arg)
395  ret void
396}
397
; Forwards a <4 x i32> inreg argument unchanged to
; @external_void_func_v4i32_inreg and returns void.
; The gfx900 checks expect four 32-bit copies before s_swappc_b64
; (s19 -> s3, s18 -> s2, s17 -> s1, s16 -> s0); the gfx1100 checks contain
; no such copies.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
398define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
399; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
400; GFX9:       ; %bb.0:
401; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402; GFX9-NEXT:    s_mov_b32 s20, s33
403; GFX9-NEXT:    s_mov_b32 s33, s32
404; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
405; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
406; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
407; GFX9-NEXT:    v_writelane_b32 v40, s20, 2
408; GFX9-NEXT:    s_addk_i32 s32, 0x400
409; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
410; GFX9-NEXT:    s_mov_b32 s3, s19
411; GFX9-NEXT:    s_mov_b32 s2, s18
412; GFX9-NEXT:    s_mov_b32 s1, s17
413; GFX9-NEXT:    s_mov_b32 s0, s16
414; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
415; GFX9-NEXT:    s_getpc_b64 s[20:21]
416; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
417; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
418; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
419; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
420; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
421; GFX9-NEXT:    s_mov_b32 s32, s33
422; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
423; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
424; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
425; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
426; GFX9-NEXT:    s_mov_b32 s33, s4
427; GFX9-NEXT:    s_waitcnt vmcnt(0)
428; GFX9-NEXT:    s_setpc_b64 s[30:31]
429;
430; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
431; GFX11:       ; %bb.0:
432; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433; GFX11-NEXT:    s_mov_b32 s16, s33
434; GFX11-NEXT:    s_mov_b32 s33, s32
435; GFX11-NEXT:    s_or_saveexec_b32 s17, -1
436; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
437; GFX11-NEXT:    s_mov_b32 exec_lo, s17
438; GFX11-NEXT:    v_writelane_b32 v40, s16, 2
439; GFX11-NEXT:    s_add_i32 s32, s32, 16
440; GFX11-NEXT:    s_getpc_b64 s[16:17]
441; GFX11-NEXT:    s_add_u32 s16, s16, external_void_func_v4i32_inreg@rel32@lo+4
442; GFX11-NEXT:    s_addc_u32 s17, s17, external_void_func_v4i32_inreg@rel32@hi+12
443; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
444; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
445; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
446; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
447; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
448; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
449; GFX11-NEXT:    s_mov_b32 s32, s33
450; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
451; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
452; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
453; GFX11-NEXT:    s_mov_b32 exec_lo, s1
454; GFX11-NEXT:    s_mov_b32 s33, s0
455; GFX11-NEXT:    s_waitcnt vmcnt(0)
456; GFX11-NEXT:    s_setpc_b64 s[30:31]
457  call void @external_void_func_v4i32_inreg(<4 x i32> inreg %arg)
458  ret void
459}
460
; Forwards an <8 x i32> inreg argument unchanged to
; @external_void_func_v8i32_inreg and returns void.
; The gfx900 checks expect eight 32-bit copies before s_swappc_b64: first
; s16..s19 are copied into s0..s3, then s20..s23 are copied into s16..s19
; (the first group must be read before the second overwrites s16..s19).
; The gfx1100 checks contain no such copies.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
461define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
462; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
463; GFX9:       ; %bb.0:
464; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; GFX9-NEXT:    s_mov_b32 s24, s33
466; GFX9-NEXT:    s_mov_b32 s33, s32
467; GFX9-NEXT:    s_or_saveexec_b64 s[26:27], -1
468; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
469; GFX9-NEXT:    s_mov_b64 exec, s[26:27]
470; GFX9-NEXT:    v_writelane_b32 v40, s24, 2
471; GFX9-NEXT:    s_addk_i32 s32, 0x400
472; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
473; GFX9-NEXT:    s_mov_b32 s3, s19
474; GFX9-NEXT:    s_mov_b32 s2, s18
475; GFX9-NEXT:    s_mov_b32 s1, s17
476; GFX9-NEXT:    s_mov_b32 s0, s16
477; GFX9-NEXT:    s_mov_b32 s16, s20
478; GFX9-NEXT:    s_mov_b32 s17, s21
479; GFX9-NEXT:    s_mov_b32 s18, s22
480; GFX9-NEXT:    s_mov_b32 s19, s23
481; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
482; GFX9-NEXT:    s_getpc_b64 s[24:25]
483; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
484; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
485; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
486; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
487; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
488; GFX9-NEXT:    s_mov_b32 s32, s33
489; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
490; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
491; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
492; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
493; GFX9-NEXT:    s_mov_b32 s33, s4
494; GFX9-NEXT:    s_waitcnt vmcnt(0)
495; GFX9-NEXT:    s_setpc_b64 s[30:31]
496;
497; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
498; GFX11:       ; %bb.0:
499; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500; GFX11-NEXT:    s_mov_b32 s20, s33
501; GFX11-NEXT:    s_mov_b32 s33, s32
502; GFX11-NEXT:    s_or_saveexec_b32 s21, -1
503; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
504; GFX11-NEXT:    s_mov_b32 exec_lo, s21
505; GFX11-NEXT:    v_writelane_b32 v40, s20, 2
506; GFX11-NEXT:    s_add_i32 s32, s32, 16
507; GFX11-NEXT:    s_getpc_b64 s[20:21]
508; GFX11-NEXT:    s_add_u32 s20, s20, external_void_func_v8i32_inreg@rel32@lo+4
509; GFX11-NEXT:    s_addc_u32 s21, s21, external_void_func_v8i32_inreg@rel32@hi+12
510; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
511; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
512; GFX11-NEXT:    s_swappc_b64 s[30:31], s[20:21]
513; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
514; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
515; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
516; GFX11-NEXT:    s_mov_b32 s32, s33
517; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
518; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
519; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
520; GFX11-NEXT:    s_mov_b32 exec_lo, s1
521; GFX11-NEXT:    s_mov_b32 s33, s0
522; GFX11-NEXT:    s_waitcnt vmcnt(0)
523; GFX11-NEXT:    s_setpc_b64 s[30:31]
524  call void @external_void_func_v8i32_inreg(<8 x i32> inreg %arg)
525  ret void
526}
527
; Forwards a half inreg argument unchanged to @external_void_func_f16_inreg
; and returns void.
; The gfx900 checks expect the value to be copied with `s_mov_b32 s0, s16`
; before s_swappc_b64; the gfx1100 checks contain no such copy.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
528define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
529; GFX9-LABEL: test_call_external_void_func_f16_inreg:
530; GFX9:       ; %bb.0:
531; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532; GFX9-NEXT:    s_mov_b32 s17, s33
533; GFX9-NEXT:    s_mov_b32 s33, s32
534; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
535; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
536; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
537; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
538; GFX9-NEXT:    s_addk_i32 s32, 0x400
539; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
540; GFX9-NEXT:    s_mov_b32 s0, s16
541; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
542; GFX9-NEXT:    s_getpc_b64 s[18:19]
543; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4
544; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
545; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
546; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
547; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
548; GFX9-NEXT:    s_mov_b32 s32, s33
549; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
550; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
551; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
552; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
553; GFX9-NEXT:    s_mov_b32 s33, s4
554; GFX9-NEXT:    s_waitcnt vmcnt(0)
555; GFX9-NEXT:    s_setpc_b64 s[30:31]
556;
557; GFX11-LABEL: test_call_external_void_func_f16_inreg:
558; GFX11:       ; %bb.0:
559; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
560; GFX11-NEXT:    s_mov_b32 s1, s33
561; GFX11-NEXT:    s_mov_b32 s33, s32
562; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
563; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
564; GFX11-NEXT:    s_mov_b32 exec_lo, s2
565; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
566; GFX11-NEXT:    s_add_i32 s32, s32, 16
567; GFX11-NEXT:    s_getpc_b64 s[2:3]
568; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16_inreg@rel32@lo+4
569; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16_inreg@rel32@hi+12
570; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
571; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
572; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
573; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
574; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
575; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
576; GFX11-NEXT:    s_mov_b32 s32, s33
577; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
578; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
579; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
580; GFX11-NEXT:    s_mov_b32 exec_lo, s1
581; GFX11-NEXT:    s_mov_b32 s33, s0
582; GFX11-NEXT:    s_waitcnt vmcnt(0)
583; GFX11-NEXT:    s_setpc_b64 s[30:31]
584  call void @external_void_func_f16_inreg(half inreg %arg)
585  ret void
586}
587
; Forwards a bfloat inreg argument unchanged to @external_void_func_bf16_inreg
; and returns void.
; The gfx900 checks expect the value to be copied with `s_mov_b32 s0, s16`
; before s_swappc_b64; the gfx1100 checks contain no such copy.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
588define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
589; GFX9-LABEL: test_call_external_void_func_bf16_inreg:
590; GFX9:       ; %bb.0:
591; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592; GFX9-NEXT:    s_mov_b32 s17, s33
593; GFX9-NEXT:    s_mov_b32 s33, s32
594; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
595; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
596; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
597; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
598; GFX9-NEXT:    s_addk_i32 s32, 0x400
599; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
600; GFX9-NEXT:    s_mov_b32 s0, s16
601; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
602; GFX9-NEXT:    s_getpc_b64 s[18:19]
603; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
604; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
605; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
606; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
607; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
608; GFX9-NEXT:    s_mov_b32 s32, s33
609; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
610; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
611; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
612; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
613; GFX9-NEXT:    s_mov_b32 s33, s4
614; GFX9-NEXT:    s_waitcnt vmcnt(0)
615; GFX9-NEXT:    s_setpc_b64 s[30:31]
616;
617; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
618; GFX11:       ; %bb.0:
619; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX11-NEXT:    s_mov_b32 s1, s33
621; GFX11-NEXT:    s_mov_b32 s33, s32
622; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
623; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
624; GFX11-NEXT:    s_mov_b32 exec_lo, s2
625; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
626; GFX11-NEXT:    s_add_i32 s32, s32, 16
627; GFX11-NEXT:    s_getpc_b64 s[2:3]
628; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4
629; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12
630; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
631; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
632; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
633; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
634; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
635; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
636; GFX11-NEXT:    s_mov_b32 s32, s33
637; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
638; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
639; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
640; GFX11-NEXT:    s_mov_b32 exec_lo, s1
641; GFX11-NEXT:    s_mov_b32 s33, s0
642; GFX11-NEXT:    s_waitcnt vmcnt(0)
643; GFX11-NEXT:    s_setpc_b64 s[30:31]
644  call void @external_void_func_bf16_inreg(bfloat inreg %arg)
645  ret void
646}
647
; Forwards a float inreg argument unchanged to @external_void_func_f32_inreg
; and returns void.
; The gfx900 checks expect the value to be copied with `s_mov_b32 s0, s16`
; before s_swappc_b64; the gfx1100 checks contain no such copy.
; Check lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
648define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
649; GFX9-LABEL: test_call_external_void_func_f32_inreg:
650; GFX9:       ; %bb.0:
651; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; GFX9-NEXT:    s_mov_b32 s17, s33
653; GFX9-NEXT:    s_mov_b32 s33, s32
654; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
655; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
656; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
657; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
658; GFX9-NEXT:    s_addk_i32 s32, 0x400
659; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
660; GFX9-NEXT:    s_mov_b32 s0, s16
661; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
662; GFX9-NEXT:    s_getpc_b64 s[18:19]
663; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4
664; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
665; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
666; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
667; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
668; GFX9-NEXT:    s_mov_b32 s32, s33
669; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
670; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
671; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
672; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
673; GFX9-NEXT:    s_mov_b32 s33, s4
674; GFX9-NEXT:    s_waitcnt vmcnt(0)
675; GFX9-NEXT:    s_setpc_b64 s[30:31]
676;
677; GFX11-LABEL: test_call_external_void_func_f32_inreg:
678; GFX11:       ; %bb.0:
679; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680; GFX11-NEXT:    s_mov_b32 s1, s33
681; GFX11-NEXT:    s_mov_b32 s33, s32
682; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
683; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
684; GFX11-NEXT:    s_mov_b32 exec_lo, s2
685; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
686; GFX11-NEXT:    s_add_i32 s32, s32, 16
687; GFX11-NEXT:    s_getpc_b64 s[2:3]
688; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32_inreg@rel32@lo+4
689; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32_inreg@rel32@hi+12
690; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
691; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
692; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
693; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
694; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
695; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
696; GFX11-NEXT:    s_mov_b32 s32, s33
697; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
698; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
699; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
700; GFX11-NEXT:    s_mov_b32 exec_lo, s1
701; GFX11-NEXT:    s_mov_b32 s33, s0
702; GFX11-NEXT:    s_waitcnt vmcnt(0)
703; GFX11-NEXT:    s_setpc_b64 s[30:31]
704  call void @external_void_func_f32_inreg(float inreg %arg)
705  ret void
706}
707
; Forwards one `double inreg` argument unchanged to
; @external_void_func_f64_inreg. The autogenerated gfx900 checks expect two
; SGPR copies (s17 -> s1, s16 -> s0, presumably the two halves of the
; argument) ahead of s_swappc_b64; the gfx1100 checks contain no such copies.
; NOTE(review): on gfx900 the copies write s0/s1, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
708define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
709; GFX9-LABEL: test_call_external_void_func_f64_inreg:
710; GFX9:       ; %bb.0:
711; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
712; GFX9-NEXT:    s_mov_b32 s18, s33
713; GFX9-NEXT:    s_mov_b32 s33, s32
714; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
715; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
716; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
717; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
718; GFX9-NEXT:    s_addk_i32 s32, 0x400
719; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
720; GFX9-NEXT:    s_mov_b32 s1, s17
721; GFX9-NEXT:    s_mov_b32 s0, s16
722; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
723; GFX9-NEXT:    s_getpc_b64 s[18:19]
724; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
725; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
726; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
727; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
728; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
729; GFX9-NEXT:    s_mov_b32 s32, s33
730; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
731; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
732; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
733; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
734; GFX9-NEXT:    s_mov_b32 s33, s4
735; GFX9-NEXT:    s_waitcnt vmcnt(0)
736; GFX9-NEXT:    s_setpc_b64 s[30:31]
737;
738; GFX11-LABEL: test_call_external_void_func_f64_inreg:
739; GFX11:       ; %bb.0:
740; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
741; GFX11-NEXT:    s_mov_b32 s2, s33
742; GFX11-NEXT:    s_mov_b32 s33, s32
743; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
744; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
745; GFX11-NEXT:    s_mov_b32 exec_lo, s3
746; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
747; GFX11-NEXT:    s_add_i32 s32, s32, 16
748; GFX11-NEXT:    s_getpc_b64 s[2:3]
749; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64_inreg@rel32@lo+4
750; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64_inreg@rel32@hi+12
751; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
752; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
753; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
754; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
755; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
756; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
757; GFX11-NEXT:    s_mov_b32 s32, s33
758; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
759; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
760; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
761; GFX11-NEXT:    s_mov_b32 exec_lo, s1
762; GFX11-NEXT:    s_mov_b32 s33, s0
763; GFX11-NEXT:    s_waitcnt vmcnt(0)
764; GFX11-NEXT:    s_setpc_b64 s[30:31]
765  call void @external_void_func_f64_inreg(double inreg %arg)
766  ret void
767}
768
; Forwards one `<2 x half> inreg` argument unchanged to
; @external_void_func_v2f16_inreg. The autogenerated gfx900 checks expect a
; single SGPR copy (s16 -> s0, presumably the packed argument) ahead of
; s_swappc_b64; the gfx1100 checks contain no such copy.
; NOTE(review): on gfx900 the copy writes s0, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
769define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 {
770; GFX9-LABEL: test_call_external_void_func_v2f16_inreg:
771; GFX9:       ; %bb.0:
772; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
773; GFX9-NEXT:    s_mov_b32 s17, s33
774; GFX9-NEXT:    s_mov_b32 s33, s32
775; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
776; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
777; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
778; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
779; GFX9-NEXT:    s_addk_i32 s32, 0x400
780; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
781; GFX9-NEXT:    s_mov_b32 s0, s16
782; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
783; GFX9-NEXT:    s_getpc_b64 s[18:19]
784; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4
785; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
786; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
787; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
788; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
789; GFX9-NEXT:    s_mov_b32 s32, s33
790; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
791; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
792; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
793; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
794; GFX9-NEXT:    s_mov_b32 s33, s4
795; GFX9-NEXT:    s_waitcnt vmcnt(0)
796; GFX9-NEXT:    s_setpc_b64 s[30:31]
797;
798; GFX11-LABEL: test_call_external_void_func_v2f16_inreg:
799; GFX11:       ; %bb.0:
800; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
801; GFX11-NEXT:    s_mov_b32 s1, s33
802; GFX11-NEXT:    s_mov_b32 s33, s32
803; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
804; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
805; GFX11-NEXT:    s_mov_b32 exec_lo, s2
806; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
807; GFX11-NEXT:    s_add_i32 s32, s32, 16
808; GFX11-NEXT:    s_getpc_b64 s[2:3]
809; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16_inreg@rel32@lo+4
810; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16_inreg@rel32@hi+12
811; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
812; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
813; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
814; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
815; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
816; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
817; GFX11-NEXT:    s_mov_b32 s32, s33
818; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
819; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
820; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
821; GFX11-NEXT:    s_mov_b32 exec_lo, s1
822; GFX11-NEXT:    s_mov_b32 s33, s0
823; GFX11-NEXT:    s_waitcnt vmcnt(0)
824; GFX11-NEXT:    s_setpc_b64 s[30:31]
825  call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
826  ret void
827}
828
829
; Forwards one `<2 x bfloat> inreg` argument unchanged to
; @external_void_func_v2bf16_inreg. The autogenerated gfx900 checks expect a
; single SGPR copy (s16 -> s0, presumably the packed argument) ahead of
; s_swappc_b64; the gfx1100 checks contain no such copy.
; NOTE(review): on gfx900 the copy writes s0, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
830define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 {
831; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg:
832; GFX9:       ; %bb.0:
833; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834; GFX9-NEXT:    s_mov_b32 s17, s33
835; GFX9-NEXT:    s_mov_b32 s33, s32
836; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
837; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
838; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
839; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
840; GFX9-NEXT:    s_addk_i32 s32, 0x400
841; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
842; GFX9-NEXT:    s_mov_b32 s0, s16
843; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
844; GFX9-NEXT:    s_getpc_b64 s[18:19]
845; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
846; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
847; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
848; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
849; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
850; GFX9-NEXT:    s_mov_b32 s32, s33
851; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
852; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
853; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
854; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
855; GFX9-NEXT:    s_mov_b32 s33, s4
856; GFX9-NEXT:    s_waitcnt vmcnt(0)
857; GFX9-NEXT:    s_setpc_b64 s[30:31]
858;
859; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
860; GFX11:       ; %bb.0:
861; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
862; GFX11-NEXT:    s_mov_b32 s1, s33
863; GFX11-NEXT:    s_mov_b32 s33, s32
864; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
865; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
866; GFX11-NEXT:    s_mov_b32 exec_lo, s2
867; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
868; GFX11-NEXT:    s_add_i32 s32, s32, 16
869; GFX11-NEXT:    s_getpc_b64 s[2:3]
870; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4
871; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12
872; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
873; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
874; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
875; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
876; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
877; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
878; GFX11-NEXT:    s_mov_b32 s32, s33
879; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
880; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
881; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
882; GFX11-NEXT:    s_mov_b32 exec_lo, s1
883; GFX11-NEXT:    s_mov_b32 s33, s0
884; GFX11-NEXT:    s_waitcnt vmcnt(0)
885; GFX11-NEXT:    s_setpc_b64 s[30:31]
886  call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
887  ret void
888}
889
; Forwards one `<3 x half> inreg` argument unchanged to
; @external_void_func_v3f16_inreg. The autogenerated gfx900 checks expect two
; SGPR copies (s17 -> s1, s16 -> s0, presumably the argument's two dwords)
; ahead of s_swappc_b64; the gfx1100 checks contain no such copies.
; NOTE(review): on gfx900 the copies write s0/s1, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
890define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 {
891; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
892; GFX9:       ; %bb.0:
893; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894; GFX9-NEXT:    s_mov_b32 s18, s33
895; GFX9-NEXT:    s_mov_b32 s33, s32
896; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
897; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
898; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
899; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
900; GFX9-NEXT:    s_addk_i32 s32, 0x400
901; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
902; GFX9-NEXT:    s_mov_b32 s1, s17
903; GFX9-NEXT:    s_mov_b32 s0, s16
904; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
905; GFX9-NEXT:    s_getpc_b64 s[18:19]
906; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
907; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
908; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
909; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
910; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
911; GFX9-NEXT:    s_mov_b32 s32, s33
912; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
913; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
914; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
915; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
916; GFX9-NEXT:    s_mov_b32 s33, s4
917; GFX9-NEXT:    s_waitcnt vmcnt(0)
918; GFX9-NEXT:    s_setpc_b64 s[30:31]
919;
920; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
921; GFX11:       ; %bb.0:
922; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923; GFX11-NEXT:    s_mov_b32 s2, s33
924; GFX11-NEXT:    s_mov_b32 s33, s32
925; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
926; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
927; GFX11-NEXT:    s_mov_b32 exec_lo, s3
928; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
929; GFX11-NEXT:    s_add_i32 s32, s32, 16
930; GFX11-NEXT:    s_getpc_b64 s[2:3]
931; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16_inreg@rel32@lo+4
932; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16_inreg@rel32@hi+12
933; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
934; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
935; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
936; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
937; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
938; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
939; GFX11-NEXT:    s_mov_b32 s32, s33
940; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
941; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
942; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
943; GFX11-NEXT:    s_mov_b32 exec_lo, s1
944; GFX11-NEXT:    s_mov_b32 s33, s0
945; GFX11-NEXT:    s_waitcnt vmcnt(0)
946; GFX11-NEXT:    s_setpc_b64 s[30:31]
947  call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
948  ret void
949}
950
; Forwards one `<4 x half> inreg` argument unchanged to
; @external_void_func_v4f16_inreg. The autogenerated gfx900 checks expect two
; SGPR copies (s17 -> s1, s16 -> s0, presumably the argument's two dwords)
; ahead of s_swappc_b64; the gfx1100 checks contain no such copies.
; NOTE(review): on gfx900 the copies write s0/s1, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
951define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 {
952; GFX9-LABEL: test_call_external_void_func_v4f16_inreg:
953; GFX9:       ; %bb.0:
954; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
955; GFX9-NEXT:    s_mov_b32 s18, s33
956; GFX9-NEXT:    s_mov_b32 s33, s32
957; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
958; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
959; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
960; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
961; GFX9-NEXT:    s_addk_i32 s32, 0x400
962; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
963; GFX9-NEXT:    s_mov_b32 s1, s17
964; GFX9-NEXT:    s_mov_b32 s0, s16
965; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
966; GFX9-NEXT:    s_getpc_b64 s[18:19]
967; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
968; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
969; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
970; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
971; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
972; GFX9-NEXT:    s_mov_b32 s32, s33
973; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
974; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
975; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
976; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
977; GFX9-NEXT:    s_mov_b32 s33, s4
978; GFX9-NEXT:    s_waitcnt vmcnt(0)
979; GFX9-NEXT:    s_setpc_b64 s[30:31]
980;
981; GFX11-LABEL: test_call_external_void_func_v4f16_inreg:
982; GFX11:       ; %bb.0:
983; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984; GFX11-NEXT:    s_mov_b32 s2, s33
985; GFX11-NEXT:    s_mov_b32 s33, s32
986; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
987; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
988; GFX11-NEXT:    s_mov_b32 exec_lo, s3
989; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
990; GFX11-NEXT:    s_add_i32 s32, s32, 16
991; GFX11-NEXT:    s_getpc_b64 s[2:3]
992; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4f16_inreg@rel32@lo+4
993; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4f16_inreg@rel32@hi+12
994; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
995; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
996; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
998; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
999; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1000; GFX11-NEXT:    s_mov_b32 s32, s33
1001; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1002; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1003; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1004; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1005; GFX11-NEXT:    s_mov_b32 s33, s0
1006; GFX11-NEXT:    s_waitcnt vmcnt(0)
1007; GFX11-NEXT:    s_setpc_b64 s[30:31]
1008  call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
1009  ret void
1010}
1011
; Forwards one `ptr inreg` argument unchanged to @external_void_func_p0_inreg.
; The autogenerated gfx900 checks expect two SGPR copies (s17 -> s1,
; s16 -> s0, presumably the two halves of the pointer) ahead of
; s_swappc_b64; the gfx1100 checks contain no such copies.
; NOTE(review): on gfx900 the copies write s0/s1, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
1012define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
1013; GFX9-LABEL: test_call_external_void_func_p0_inreg:
1014; GFX9:       ; %bb.0:
1015; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016; GFX9-NEXT:    s_mov_b32 s18, s33
1017; GFX9-NEXT:    s_mov_b32 s33, s32
1018; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
1019; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1020; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
1021; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
1022; GFX9-NEXT:    s_addk_i32 s32, 0x400
1023; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1024; GFX9-NEXT:    s_mov_b32 s1, s17
1025; GFX9-NEXT:    s_mov_b32 s0, s16
1026; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1027; GFX9-NEXT:    s_getpc_b64 s[18:19]
1028; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
1029; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
1030; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
1031; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1032; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1033; GFX9-NEXT:    s_mov_b32 s32, s33
1034; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1035; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1036; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1037; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1038; GFX9-NEXT:    s_mov_b32 s33, s4
1039; GFX9-NEXT:    s_waitcnt vmcnt(0)
1040; GFX9-NEXT:    s_setpc_b64 s[30:31]
1041;
1042; GFX11-LABEL: test_call_external_void_func_p0_inreg:
1043; GFX11:       ; %bb.0:
1044; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045; GFX11-NEXT:    s_mov_b32 s2, s33
1046; GFX11-NEXT:    s_mov_b32 s33, s32
1047; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
1048; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1049; GFX11-NEXT:    s_mov_b32 exec_lo, s3
1050; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
1051; GFX11-NEXT:    s_add_i32 s32, s32, 16
1052; GFX11-NEXT:    s_getpc_b64 s[2:3]
1053; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_p0_inreg@rel32@lo+4
1054; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_p0_inreg@rel32@hi+12
1055; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1056; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1057; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
1058; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1059; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1060; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1061; GFX11-NEXT:    s_mov_b32 s32, s33
1062; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1063; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1064; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1065; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1066; GFX11-NEXT:    s_mov_b32 s33, s0
1067; GFX11-NEXT:    s_waitcnt vmcnt(0)
1068; GFX11-NEXT:    s_setpc_b64 s[30:31]
1069  call void @external_void_func_p0_inreg(ptr inreg %arg)
1070  ret void
1071}
1072
; Forwards one `ptr addrspace(1) inreg` argument unchanged to
; @external_void_func_p1_inreg. The autogenerated gfx900 checks expect two
; SGPR copies (s17 -> s1, s16 -> s0, presumably the two halves of the
; pointer) ahead of s_swappc_b64; the gfx1100 checks contain no such copies.
; NOTE(review): on gfx900 the copies write s0/s1, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
1073define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 {
1074; GFX9-LABEL: test_call_external_void_func_p1_inreg:
1075; GFX9:       ; %bb.0:
1076; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077; GFX9-NEXT:    s_mov_b32 s18, s33
1078; GFX9-NEXT:    s_mov_b32 s33, s32
1079; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
1080; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1081; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
1082; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
1083; GFX9-NEXT:    s_addk_i32 s32, 0x400
1084; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1085; GFX9-NEXT:    s_mov_b32 s1, s17
1086; GFX9-NEXT:    s_mov_b32 s0, s16
1087; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1088; GFX9-NEXT:    s_getpc_b64 s[18:19]
1089; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
1090; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
1091; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
1092; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1093; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1094; GFX9-NEXT:    s_mov_b32 s32, s33
1095; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1096; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1097; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1098; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1099; GFX9-NEXT:    s_mov_b32 s33, s4
1100; GFX9-NEXT:    s_waitcnt vmcnt(0)
1101; GFX9-NEXT:    s_setpc_b64 s[30:31]
1102;
1103; GFX11-LABEL: test_call_external_void_func_p1_inreg:
1104; GFX11:       ; %bb.0:
1105; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1106; GFX11-NEXT:    s_mov_b32 s2, s33
1107; GFX11-NEXT:    s_mov_b32 s33, s32
1108; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
1109; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1110; GFX11-NEXT:    s_mov_b32 exec_lo, s3
1111; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
1112; GFX11-NEXT:    s_add_i32 s32, s32, 16
1113; GFX11-NEXT:    s_getpc_b64 s[2:3]
1114; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_p1_inreg@rel32@lo+4
1115; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_p1_inreg@rel32@hi+12
1116; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1117; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1118; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
1119; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1120; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1121; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1122; GFX11-NEXT:    s_mov_b32 s32, s33
1123; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1124; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1125; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1126; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1127; GFX11-NEXT:    s_mov_b32 s33, s0
1128; GFX11-NEXT:    s_waitcnt vmcnt(0)
1129; GFX11-NEXT:    s_setpc_b64 s[30:31]
1130  call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
1131  ret void
1132}
1133
; Forwards one `ptr addrspace(3) inreg` argument unchanged to
; @external_void_func_p3_inreg. The autogenerated gfx900 checks expect a
; single SGPR copy (s16 -> s0, presumably the pointer value) ahead of
; s_swappc_b64; the gfx1100 checks contain no such copy.
; NOTE(review): on gfx900 the copy writes s0, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
1134define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) #0 {
1135; GFX9-LABEL: test_call_external_void_func_p3_inreg:
1136; GFX9:       ; %bb.0:
1137; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1138; GFX9-NEXT:    s_mov_b32 s17, s33
1139; GFX9-NEXT:    s_mov_b32 s33, s32
1140; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
1141; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1142; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
1143; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
1144; GFX9-NEXT:    s_addk_i32 s32, 0x400
1145; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1146; GFX9-NEXT:    s_mov_b32 s0, s16
1147; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1148; GFX9-NEXT:    s_getpc_b64 s[18:19]
1149; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4
1150; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
1151; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
1152; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1153; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1154; GFX9-NEXT:    s_mov_b32 s32, s33
1155; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1156; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1157; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1158; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1159; GFX9-NEXT:    s_mov_b32 s33, s4
1160; GFX9-NEXT:    s_waitcnt vmcnt(0)
1161; GFX9-NEXT:    s_setpc_b64 s[30:31]
1162;
1163; GFX11-LABEL: test_call_external_void_func_p3_inreg:
1164; GFX11:       ; %bb.0:
1165; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; GFX11-NEXT:    s_mov_b32 s1, s33
1167; GFX11-NEXT:    s_mov_b32 s33, s32
1168; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
1169; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1170; GFX11-NEXT:    s_mov_b32 exec_lo, s2
1171; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
1172; GFX11-NEXT:    s_add_i32 s32, s32, 16
1173; GFX11-NEXT:    s_getpc_b64 s[2:3]
1174; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_p3_inreg@rel32@lo+4
1175; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_p3_inreg@rel32@hi+12
1176; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1177; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1178; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
1179; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1180; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1181; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1182; GFX11-NEXT:    s_mov_b32 s32, s33
1183; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1184; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1185; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1186; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1187; GFX11-NEXT:    s_mov_b32 s33, s0
1188; GFX11-NEXT:    s_waitcnt vmcnt(0)
1189; GFX11-NEXT:    s_setpc_b64 s[30:31]
1190  call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
1191  ret void
1192}
1193
; Forwards one `<2 x ptr addrspace(1)> inreg` argument unchanged to
; @external_void_func_v2p1_inreg. The autogenerated gfx900 checks expect four
; SGPR copies (s16..s19 -> s0..s3, presumably the argument's four dwords)
; ahead of s_swappc_b64. The gfx1100 checks contain no such copies and build
; the callee address in s[16:17].
; NOTE(review): on gfx900 the copies overwrite all of s0..s3, yet the v40
; reload after the call still uses s[0:3] as its resource operand - confirm
; this lowering is intended before regenerating the checks.
1194define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 {
1195; GFX9-LABEL: test_call_external_void_func_v2p1_inreg:
1196; GFX9:       ; %bb.0:
1197; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1198; GFX9-NEXT:    s_mov_b32 s20, s33
1199; GFX9-NEXT:    s_mov_b32 s33, s32
1200; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
1201; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1202; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
1203; GFX9-NEXT:    v_writelane_b32 v40, s20, 2
1204; GFX9-NEXT:    s_addk_i32 s32, 0x400
1205; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1206; GFX9-NEXT:    s_mov_b32 s3, s19
1207; GFX9-NEXT:    s_mov_b32 s2, s18
1208; GFX9-NEXT:    s_mov_b32 s1, s17
1209; GFX9-NEXT:    s_mov_b32 s0, s16
1210; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1211; GFX9-NEXT:    s_getpc_b64 s[20:21]
1212; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
1213; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
1214; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
1215; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1216; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1217; GFX9-NEXT:    s_mov_b32 s32, s33
1218; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1219; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1220; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1221; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1222; GFX9-NEXT:    s_mov_b32 s33, s4
1223; GFX9-NEXT:    s_waitcnt vmcnt(0)
1224; GFX9-NEXT:    s_setpc_b64 s[30:31]
1225;
1226; GFX11-LABEL: test_call_external_void_func_v2p1_inreg:
1227; GFX11:       ; %bb.0:
1228; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1229; GFX11-NEXT:    s_mov_b32 s16, s33
1230; GFX11-NEXT:    s_mov_b32 s33, s32
1231; GFX11-NEXT:    s_or_saveexec_b32 s17, -1
1232; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1233; GFX11-NEXT:    s_mov_b32 exec_lo, s17
1234; GFX11-NEXT:    v_writelane_b32 v40, s16, 2
1235; GFX11-NEXT:    s_add_i32 s32, s32, 16
1236; GFX11-NEXT:    s_getpc_b64 s[16:17]
1237; GFX11-NEXT:    s_add_u32 s16, s16, external_void_func_v2p1_inreg@rel32@lo+4
1238; GFX11-NEXT:    s_addc_u32 s17, s17, external_void_func_v2p1_inreg@rel32@hi+12
1239; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1240; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1241; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1242; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1243; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1244; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1245; GFX11-NEXT:    s_mov_b32 s32, s33
1246; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1247; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1248; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1249; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1250; GFX11-NEXT:    s_mov_b32 s33, s0
1251; GFX11-NEXT:    s_waitcnt vmcnt(0)
1252; GFX11-NEXT:    s_setpc_b64 s[30:31]
1253  call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
1254  ret void
1255}
1256
; Forwards one `<2 x ptr addrspace(5)> inreg` argument unchanged to
; @external_void_func_v2p5_inreg. The autogenerated gfx900 checks expect two
; SGPR copies (s17 -> s1, s16 -> s0, presumably one per pointer element)
; ahead of s_swappc_b64; the gfx1100 checks contain no such copies.
; NOTE(review): on gfx900 the copies write s0/s1, yet the v40 reload after the
; call still uses s[0:3] as its resource operand - confirm this lowering is
; intended before regenerating the checks.
1257define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 {
1258; GFX9-LABEL: test_call_external_void_func_v2p5_inreg:
1259; GFX9:       ; %bb.0:
1260; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261; GFX9-NEXT:    s_mov_b32 s18, s33
1262; GFX9-NEXT:    s_mov_b32 s33, s32
1263; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
1264; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1265; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
1266; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
1267; GFX9-NEXT:    s_addk_i32 s32, 0x400
1268; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1269; GFX9-NEXT:    s_mov_b32 s1, s17
1270; GFX9-NEXT:    s_mov_b32 s0, s16
1271; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1272; GFX9-NEXT:    s_getpc_b64 s[18:19]
1273; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
1274; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
1275; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
1276; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1277; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1278; GFX9-NEXT:    s_mov_b32 s32, s33
1279; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1280; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1281; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1282; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1283; GFX9-NEXT:    s_mov_b32 s33, s4
1284; GFX9-NEXT:    s_waitcnt vmcnt(0)
1285; GFX9-NEXT:    s_setpc_b64 s[30:31]
1286;
1287; GFX11-LABEL: test_call_external_void_func_v2p5_inreg:
1288; GFX11:       ; %bb.0:
1289; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1290; GFX11-NEXT:    s_mov_b32 s2, s33
1291; GFX11-NEXT:    s_mov_b32 s33, s32
1292; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
1293; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1294; GFX11-NEXT:    s_mov_b32 exec_lo, s3
1295; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
1296; GFX11-NEXT:    s_add_i32 s32, s32, 16
1297; GFX11-NEXT:    s_getpc_b64 s[2:3]
1298; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2p5_inreg@rel32@lo+4
1299; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2p5_inreg@rel32@hi+12
1300; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1301; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1302; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
1303; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1304; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1305; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1306; GFX11-NEXT:    s_mov_b32 s32, s33
1307; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1308; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1309; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1310; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1311; GFX11-NEXT:    s_mov_b32 s33, s0
1312; GFX11-NEXT:    s_waitcnt vmcnt(0)
1313; GFX11-NEXT:    s_setpc_b64 s[30:31]
1314  call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
1315  ret void
1316}
1317
; Forwards three scalar inreg arguments (i64, i32, i64) unchanged to the
; external callee of the matching name, then returns.
; The gfx900 assertions expect s16-s19 to be copied into s0-s3 and s20 into s16
; ahead of the call (presumably moving the incoming argument registers into the
; outgoing ones); the gfx1100 assertions expect no argument copies at all.
; NOTE(review): the gfx900 output also uses s[0:3] as the resource operand of
; the v40 spill and reload, so the copies above overwrite it -- confirm this is
; the intended lowering.
1318define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 {
1319; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
1320; GFX9:       ; %bb.0:
1321; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1322; GFX9-NEXT:    s_mov_b32 s21, s33
1323; GFX9-NEXT:    s_mov_b32 s33, s32
1324; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
1325; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1326; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
1327; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
1328; GFX9-NEXT:    s_addk_i32 s32, 0x400
1329; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1330; GFX9-NEXT:    s_mov_b32 s3, s19
1331; GFX9-NEXT:    s_mov_b32 s2, s18
1332; GFX9-NEXT:    s_mov_b32 s1, s17
1333; GFX9-NEXT:    s_mov_b32 s0, s16
1334; GFX9-NEXT:    s_mov_b32 s16, s20
1335; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1336; GFX9-NEXT:    s_getpc_b64 s[22:23]
1337; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
1338; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
1339; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
1340; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1341; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1342; GFX9-NEXT:    s_mov_b32 s32, s33
1343; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1344; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1345; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1346; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1347; GFX9-NEXT:    s_mov_b32 s33, s4
1348; GFX9-NEXT:    s_waitcnt vmcnt(0)
1349; GFX9-NEXT:    s_setpc_b64 s[30:31]
1350;
1351; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
1352; GFX11:       ; %bb.0:
1353; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1354; GFX11-NEXT:    s_mov_b32 s17, s33
1355; GFX11-NEXT:    s_mov_b32 s33, s32
1356; GFX11-NEXT:    s_or_saveexec_b32 s18, -1
1357; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1358; GFX11-NEXT:    s_mov_b32 exec_lo, s18
1359; GFX11-NEXT:    v_writelane_b32 v40, s17, 2
1360; GFX11-NEXT:    s_add_i32 s32, s32, 16
1361; GFX11-NEXT:    s_getpc_b64 s[18:19]
1362; GFX11-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
1363; GFX11-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
1364; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1365; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1366; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
1367; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1368; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1369; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1370; GFX11-NEXT:    s_mov_b32 s32, s33
1371; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1372; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1373; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1374; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1375; GFX11-NEXT:    s_mov_b32 s33, s0
1376; GFX11-NEXT:    s_waitcnt vmcnt(0)
1377; GFX11-NEXT:    s_setpc_b64 s[30:31]
1378  call void @external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2)
1379  ret void
1380}
1381
; Forwards a single [13 x i32] inreg aggregate unchanged to the external callee,
; then returns.
; NOTE(review): the function and callee names say "a15i32" although the
; argument type is [13 x i32] -- confirm whether the name or the length is stale.
; The gfx900 assertions expect 13 scalar copies before the call: s16-s19 into
; s0-s3 and s20-s28 into s16-s24. They also expect vcc to hold first the saved
; exec mask and later the callee address. The gfx1100 assertions expect no
; argument copies.
1382define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 {
1383; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
1384; GFX9:       ; %bb.0:
1385; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1386; GFX9-NEXT:    s_mov_b32 s29, s33
1387; GFX9-NEXT:    s_mov_b32 s33, s32
1388; GFX9-NEXT:    s_or_saveexec_b64 vcc, -1
1389; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1390; GFX9-NEXT:    s_mov_b64 exec, vcc
1391; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
1392; GFX9-NEXT:    s_addk_i32 s32, 0x400
1393; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1394; GFX9-NEXT:    s_mov_b32 s3, s19
1395; GFX9-NEXT:    s_mov_b32 s2, s18
1396; GFX9-NEXT:    s_mov_b32 s1, s17
1397; GFX9-NEXT:    s_mov_b32 s0, s16
1398; GFX9-NEXT:    s_mov_b32 s16, s20
1399; GFX9-NEXT:    s_mov_b32 s17, s21
1400; GFX9-NEXT:    s_mov_b32 s18, s22
1401; GFX9-NEXT:    s_mov_b32 s19, s23
1402; GFX9-NEXT:    s_mov_b32 s20, s24
1403; GFX9-NEXT:    s_mov_b32 s21, s25
1404; GFX9-NEXT:    s_mov_b32 s22, s26
1405; GFX9-NEXT:    s_mov_b32 s23, s27
1406; GFX9-NEXT:    s_mov_b32 s24, s28
1407; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1408; GFX9-NEXT:    s_getpc_b64 vcc
1409; GFX9-NEXT:    s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4
1410; GFX9-NEXT:    s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12
1411; GFX9-NEXT:    s_swappc_b64 s[30:31], vcc
1412; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1413; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1414; GFX9-NEXT:    s_mov_b32 s32, s33
1415; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1416; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1417; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1418; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1419; GFX9-NEXT:    s_mov_b32 s33, s4
1420; GFX9-NEXT:    s_waitcnt vmcnt(0)
1421; GFX9-NEXT:    s_setpc_b64 s[30:31]
1422;
1423; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
1424; GFX11:       ; %bb.0:
1425; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1426; GFX11-NEXT:    s_mov_b32 s25, s33
1427; GFX11-NEXT:    s_mov_b32 s33, s32
1428; GFX11-NEXT:    s_or_saveexec_b32 s26, -1
1429; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1430; GFX11-NEXT:    s_mov_b32 exec_lo, s26
1431; GFX11-NEXT:    v_writelane_b32 v40, s25, 2
1432; GFX11-NEXT:    s_add_i32 s32, s32, 16
1433; GFX11-NEXT:    s_getpc_b64 s[26:27]
1434; GFX11-NEXT:    s_add_u32 s26, s26, external_void_func_a15i32_inreg@rel32@lo+4
1435; GFX11-NEXT:    s_addc_u32 s27, s27, external_void_func_a15i32_inreg@rel32@hi+12
1436; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1437; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1438; GFX11-NEXT:    s_swappc_b64 s[30:31], s[26:27]
1439; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1440; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1441; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1442; GFX11-NEXT:    s_mov_b32 s32, s33
1443; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1444; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1445; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1446; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1447; GFX11-NEXT:    s_mov_b32 s33, s0
1448; GFX11-NEXT:    s_waitcnt vmcnt(0)
1449; GFX11-NEXT:    s_setpc_b64 s[30:31]
1450  call void @external_void_func_a15i32_inreg([13 x i32] inreg %arg0)
1451  ret void
1452}
1453
1454
1455; FIXME: This should also fail
; Takes a [13 x i32] inreg aggregate plus one i32 inreg argument and forwards
; both unchanged to the "__noimplicit" external callee. This function uses
; attribute group #1 rather than #0.
; The gfx900 assertions expect 14 scalar copies before the call: s4-s11 into
; s0-s7, s15-s18 into s8-s11, s19 into s15 and s20 into s16. The gfx1100
; assertions expect no argument copies.
1456define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 {
1457; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
1458; GFX9:       ; %bb.0:
1459; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1460; GFX9-NEXT:    s_mov_b32 s21, s33
1461; GFX9-NEXT:    s_mov_b32 s33, s32
1462; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
1463; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1464; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
1465; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
1466; GFX9-NEXT:    s_addk_i32 s32, 0x400
1467; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
1468; GFX9-NEXT:    s_mov_b32 s3, s7
1469; GFX9-NEXT:    s_mov_b32 s2, s6
1470; GFX9-NEXT:    s_mov_b32 s1, s5
1471; GFX9-NEXT:    s_mov_b32 s0, s4
1472; GFX9-NEXT:    s_mov_b32 s4, s8
1473; GFX9-NEXT:    s_mov_b32 s5, s9
1474; GFX9-NEXT:    s_mov_b32 s6, s10
1475; GFX9-NEXT:    s_mov_b32 s7, s11
1476; GFX9-NEXT:    s_mov_b32 s8, s15
1477; GFX9-NEXT:    s_mov_b32 s9, s16
1478; GFX9-NEXT:    s_mov_b32 s10, s17
1479; GFX9-NEXT:    s_mov_b32 s11, s18
1480; GFX9-NEXT:    s_mov_b32 s15, s19
1481; GFX9-NEXT:    s_mov_b32 s16, s20
1482; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
1483; GFX9-NEXT:    s_getpc_b64 s[22:23]
1484; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
1485; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
1486; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
1487; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
1488; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
1489; GFX9-NEXT:    s_mov_b32 s32, s33
1490; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
1491; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
1492; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
1493; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
1494; GFX9-NEXT:    s_mov_b32 s33, s4
1495; GFX9-NEXT:    s_waitcnt vmcnt(0)
1496; GFX9-NEXT:    s_setpc_b64 s[30:31]
1497;
1498; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
1499; GFX11:       ; %bb.0:
1500; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1501; GFX11-NEXT:    s_mov_b32 s17, s33
1502; GFX11-NEXT:    s_mov_b32 s33, s32
1503; GFX11-NEXT:    s_or_saveexec_b32 s18, -1
1504; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
1505; GFX11-NEXT:    s_mov_b32 exec_lo, s18
1506; GFX11-NEXT:    v_writelane_b32 v40, s17, 2
1507; GFX11-NEXT:    s_add_i32 s32, s32, 16
1508; GFX11-NEXT:    s_getpc_b64 s[18:19]
1509; GFX11-NEXT:    s_add_u32 s18, s18, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
1510; GFX11-NEXT:    s_addc_u32 s19, s19, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
1511; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
1512; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
1513; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
1514; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1515; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
1516; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
1517; GFX11-NEXT:    s_mov_b32 s32, s33
1518; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
1519; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
1520; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
1521; GFX11-NEXT:    s_mov_b32 exec_lo, s1
1522; GFX11-NEXT:    s_mov_b32 s33, s0
1523; GFX11-NEXT:    s_waitcnt vmcnt(0)
1524; GFX11-NEXT:    s_setpc_b64 s[30:31]
1525  call void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([13 x i32] inreg %arg0, i32 inreg %arg1)
1526  ret void
1527}
1528
; Attribute group #0: nounwind only.
1529attributes #0 = { nounwind }
; Attribute group #1: nounwind plus eleven "amdgpu-no-*" string attributes
; (presumably marking the named implicit inputs as unused -- confirm against
; the AMDGPU usage documentation).
1530attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
1531
; Module flag setting amdhsa_code_object_version to 500.
1532!llvm.module.flags = !{!0}
1533!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
1534