xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
5
6declare void @extern_func() #2
7
8define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
9; The vgpr tuple8 operand in image_gather4_c_b_cl instruction need not be
10; preserved across the call and should get 8 scratch registers.
; The empty inline asm statements in the body clobber v0-v31. In the checks
; below the address operands are copied into v[32:36], which are not spilled,
; while the result v[40:43] is spilled in the prologue and reloaded in the
; epilogue because %v is still live after the call to @extern_func.
11; GFX9-LABEL: non_preserved_vgpr_tuple8:
12; GFX9:       ; %bb.0: ; %main_body
13; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; GFX9-NEXT:    s_mov_b32 s4, s33
15; GFX9-NEXT:    s_mov_b32 s33, s32
16; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
17; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
18; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
19; GFX9-NEXT:    v_mov_b32_e32 v36, v16
20; GFX9-NEXT:    v_mov_b32_e32 v35, v15
21; GFX9-NEXT:    v_mov_b32_e32 v34, v14
22; GFX9-NEXT:    v_mov_b32_e32 v33, v13
23; GFX9-NEXT:    v_mov_b32_e32 v32, v12
24; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
25; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
26; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
27; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
28; GFX9-NEXT:    ;;#ASMSTART
29; GFX9-NEXT:    ;;#ASMEND
30; GFX9-NEXT:    ;;#ASMSTART
31; GFX9-NEXT:    ;;#ASMEND
32; GFX9-NEXT:    ;;#ASMSTART
33; GFX9-NEXT:    ;;#ASMEND
34; GFX9-NEXT:    ;;#ASMSTART
35; GFX9-NEXT:    ;;#ASMEND
36; GFX9-NEXT:    image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1
37; GFX9-NEXT:    s_addk_i32 s32, 0x800
38; GFX9-NEXT:    v_writelane_b32 v44, s4, 2
39; GFX9-NEXT:    s_getpc_b64 s[4:5]
40; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
41; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
42; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
43; GFX9-NEXT:    v_writelane_b32 v44, s30, 0
44; GFX9-NEXT:    v_writelane_b32 v44, s31, 1
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
47; GFX9-NEXT:    v_mov_b32_e32 v0, v40
48; GFX9-NEXT:    v_mov_b32_e32 v1, v41
49; GFX9-NEXT:    v_mov_b32_e32 v2, v42
50; GFX9-NEXT:    v_mov_b32_e32 v3, v43
51; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
52; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
53; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
54; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
55; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
56; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
57; GFX9-NEXT:    s_mov_b32 s32, s33
58; GFX9-NEXT:    v_readlane_b32 s4, v44, 2
59; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
60; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
61; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
62; GFX9-NEXT:    s_mov_b32 s33, s4
63; GFX9-NEXT:    s_waitcnt vmcnt(0)
64; GFX9-NEXT:    s_setpc_b64 s[30:31]
65;
66; GFX10-LABEL: non_preserved_vgpr_tuple8:
67; GFX10:       ; %bb.0: ; %main_body
68; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69; GFX10-NEXT:    s_mov_b32 s4, s33
70; GFX10-NEXT:    s_mov_b32 s33, s32
71; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
72; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
73; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
74; GFX10-NEXT:    s_mov_b32 exec_lo, s5
75; GFX10-NEXT:    v_mov_b32_e32 v36, v16
76; GFX10-NEXT:    v_mov_b32_e32 v35, v15
77; GFX10-NEXT:    v_mov_b32_e32 v34, v14
78; GFX10-NEXT:    v_mov_b32_e32 v33, v13
79; GFX10-NEXT:    v_mov_b32_e32 v32, v12
80; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
81; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
82; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
83; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
84; GFX10-NEXT:    ;;#ASMSTART
85; GFX10-NEXT:    ;;#ASMEND
86; GFX10-NEXT:    ;;#ASMSTART
87; GFX10-NEXT:    ;;#ASMEND
88; GFX10-NEXT:    ;;#ASMSTART
89; GFX10-NEXT:    ;;#ASMEND
90; GFX10-NEXT:    ;;#ASMSTART
91; GFX10-NEXT:    ;;#ASMEND
92; GFX10-NEXT:    image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
93; GFX10-NEXT:    s_addk_i32 s32, 0x400
94; GFX10-NEXT:    v_writelane_b32 v44, s4, 2
95; GFX10-NEXT:    s_getpc_b64 s[4:5]
96; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
97; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
98; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
99; GFX10-NEXT:    v_writelane_b32 v44, s30, 0
100; GFX10-NEXT:    v_writelane_b32 v44, s31, 1
101; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
103; GFX10-NEXT:    v_mov_b32_e32 v0, v40
104; GFX10-NEXT:    v_mov_b32_e32 v1, v41
105; GFX10-NEXT:    v_mov_b32_e32 v2, v42
106; GFX10-NEXT:    v_mov_b32_e32 v3, v43
107; GFX10-NEXT:    s_clause 0x3
108; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33
109; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
110; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
111; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
112; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
113; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
114; GFX10-NEXT:    s_mov_b32 s32, s33
115; GFX10-NEXT:    v_readlane_b32 s4, v44, 2
116; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
117; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
118; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
119; GFX10-NEXT:    s_mov_b32 exec_lo, s5
120; GFX10-NEXT:    s_mov_b32 s33, s4
121; GFX10-NEXT:    s_waitcnt vmcnt(0)
122; GFX10-NEXT:    s_setpc_b64 s[30:31]
123;
124; GFX11-LABEL: non_preserved_vgpr_tuple8:
125; GFX11:       ; %bb.0: ; %main_body
126; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127; GFX11-NEXT:    s_mov_b32 s0, s33
128; GFX11-NEXT:    s_mov_b32 s33, s32
129; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
130; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
131; GFX11-NEXT:    s_mov_b32 exec_lo, s1
132; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
133; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
134; GFX11-NEXT:    v_mov_b32_e32 v32, v12
135; GFX11-NEXT:    s_clause 0x3
136; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:12
137; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:8
138; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:4
139; GFX11-NEXT:    scratch_store_b32 off, v43, s33
140; GFX11-NEXT:    ;;#ASMSTART
141; GFX11-NEXT:    ;;#ASMEND
142; GFX11-NEXT:    ;;#ASMSTART
143; GFX11-NEXT:    ;;#ASMEND
144; GFX11-NEXT:    ;;#ASMSTART
145; GFX11-NEXT:    ;;#ASMEND
146; GFX11-NEXT:    ;;#ASMSTART
147; GFX11-NEXT:    ;;#ASMEND
148; GFX11-NEXT:    image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
149; GFX11-NEXT:    s_add_i32 s32, s32, 32
150; GFX11-NEXT:    v_writelane_b32 v44, s0, 2
151; GFX11-NEXT:    s_getpc_b64 s[0:1]
152; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
153; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
154; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
155; GFX11-NEXT:    v_writelane_b32 v44, s30, 0
156; GFX11-NEXT:    v_writelane_b32 v44, s31, 1
157; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
159; GFX11-NEXT:    v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41
160; GFX11-NEXT:    v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43
161; GFX11-NEXT:    s_clause 0x3
162; GFX11-NEXT:    scratch_load_b32 v43, off, s33
163; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
164; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
165; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
166; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
167; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
168; GFX11-NEXT:    s_mov_b32 s32, s33
169; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
170; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
171; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
172; GFX11-NEXT:    s_mov_b32 exec_lo, s1
173; GFX11-NEXT:    s_mov_b32 s33, s0
174; GFX11-NEXT:    s_waitcnt vmcnt(0)
175; GFX11-NEXT:    s_setpc_b64 s[30:31]
176
177
178
179
180
181
182
183
184
185
186main_body:
187  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
188  call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
189  call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
190  call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
191  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
192  call void @extern_func()
193  ret <4 x float> %v
194}
195
196define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
197; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved
198; across the call and should get allocated to 8 CSRs.
199; Only the lower 5 sub-registers of the tuple are preserved.
200; The upper 3 sub-registers are unused.
; In the checks below the five address values are held in v40-v44, which are
; saved in the prologue and restored in the epilogue, and are reused by the
; second image_gather4_c_b_cl that follows the call to @extern_func.
201; GFX9-LABEL: call_preserved_vgpr_tuple8:
202; GFX9:       ; %bb.0: ; %main_body
203; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204; GFX9-NEXT:    s_mov_b32 s4, s33
205; GFX9-NEXT:    s_mov_b32 s33, s32
206; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
207; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
208; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
209; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
210; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
211; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
212; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
213; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
214; GFX9-NEXT:    v_mov_b32_e32 v44, v16
215; GFX9-NEXT:    v_mov_b32_e32 v43, v15
216; GFX9-NEXT:    v_mov_b32_e32 v42, v14
217; GFX9-NEXT:    v_mov_b32_e32 v41, v13
218; GFX9-NEXT:    v_mov_b32_e32 v40, v12
219; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1
220; GFX9-NEXT:    s_addk_i32 s32, 0x800
221; GFX9-NEXT:    v_writelane_b32 v45, s4, 2
222; GFX9-NEXT:    s_getpc_b64 s[4:5]
223; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
224; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
225; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
226; GFX9-NEXT:    v_writelane_b32 v45, s30, 0
227; GFX9-NEXT:    v_writelane_b32 v45, s31, 1
228; GFX9-NEXT:    s_waitcnt vmcnt(0)
229; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
230; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
232; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1
233; GFX9-NEXT:    s_nop 0
234; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
235; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
236; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
237; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
238; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
239; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
240; GFX9-NEXT:    v_readlane_b32 s30, v45, 0
241; GFX9-NEXT:    s_mov_b32 s32, s33
242; GFX9-NEXT:    v_readlane_b32 s4, v45, 2
243; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
244; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
245; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
246; GFX9-NEXT:    s_mov_b32 s33, s4
247; GFX9-NEXT:    s_waitcnt vmcnt(0)
248; GFX9-NEXT:    s_setpc_b64 s[30:31]
249;
250; GFX10-LABEL: call_preserved_vgpr_tuple8:
251; GFX10:       ; %bb.0: ; %main_body
252; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253; GFX10-NEXT:    s_mov_b32 s4, s33
254; GFX10-NEXT:    s_mov_b32 s33, s32
255; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
256; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
257; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
258; GFX10-NEXT:    s_mov_b32 exec_lo, s5
259; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
260; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
261; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
262; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
263; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
264; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
265; GFX10-NEXT:    s_addk_i32 s32, 0x400
266; GFX10-NEXT:    v_writelane_b32 v45, s4, 2
267; GFX10-NEXT:    s_getpc_b64 s[4:5]
268; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
269; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
270; GFX10-NEXT:    v_mov_b32_e32 v40, v16
271; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
272; GFX10-NEXT:    v_mov_b32_e32 v41, v15
273; GFX10-NEXT:    v_writelane_b32 v45, s30, 0
274; GFX10-NEXT:    v_mov_b32_e32 v42, v14
275; GFX10-NEXT:    v_mov_b32_e32 v43, v13
276; GFX10-NEXT:    v_mov_b32_e32 v44, v12
277; GFX10-NEXT:    v_writelane_b32 v45, s31, 1
278; GFX10-NEXT:    s_waitcnt vmcnt(0)
279; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
280; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
282; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
283; GFX10-NEXT:    s_clause 0x4
284; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
285; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
286; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
287; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
288; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
289; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
290; GFX10-NEXT:    v_readlane_b32 s30, v45, 0
291; GFX10-NEXT:    s_mov_b32 s32, s33
292; GFX10-NEXT:    v_readlane_b32 s4, v45, 2
293; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
294; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
295; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
296; GFX10-NEXT:    s_mov_b32 exec_lo, s5
297; GFX10-NEXT:    s_mov_b32 s33, s4
298; GFX10-NEXT:    s_waitcnt vmcnt(0)
299; GFX10-NEXT:    s_setpc_b64 s[30:31]
300;
301; GFX11-LABEL: call_preserved_vgpr_tuple8:
302; GFX11:       ; %bb.0: ; %main_body
303; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX11-NEXT:    s_mov_b32 s0, s33
305; GFX11-NEXT:    s_mov_b32 s33, s32
306; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
307; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:20 ; 4-byte Folded Spill
308; GFX11-NEXT:    s_mov_b32 exec_lo, s1
309; GFX11-NEXT:    s_clause 0x4
310; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16
311; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
312; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
313; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
314; GFX11-NEXT:    scratch_store_b32 off, v44, s33
315; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
316; GFX11-NEXT:    s_add_i32 s32, s32, 32
317; GFX11-NEXT:    v_writelane_b32 v45, s0, 2
318; GFX11-NEXT:    s_getpc_b64 s[0:1]
319; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
320; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
321; GFX11-NEXT:    v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15
322; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
323; GFX11-NEXT:    v_writelane_b32 v45, s30, 0
324; GFX11-NEXT:    v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13
325; GFX11-NEXT:    v_mov_b32_e32 v44, v12
326; GFX11-NEXT:    v_writelane_b32 v45, s31, 1
327; GFX11-NEXT:    s_waitcnt vmcnt(0)
328; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
329; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
331; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
332; GFX11-NEXT:    s_clause 0x4
333; GFX11-NEXT:    scratch_load_b32 v44, off, s33
334; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
335; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
336; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
337; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
338; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
339; GFX11-NEXT:    v_readlane_b32 s30, v45, 0
340; GFX11-NEXT:    s_mov_b32 s32, s33
341; GFX11-NEXT:    v_readlane_b32 s0, v45, 2
342; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
343; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload
344; GFX11-NEXT:    s_mov_b32 exec_lo, s1
345; GFX11-NEXT:    s_mov_b32 s33, s0
346; GFX11-NEXT:    s_waitcnt vmcnt(0)
347; GFX11-NEXT:    s_setpc_b64 s[30:31]
348
349
350
351
352
353
354
355main_body:
356  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
357  store <4 x float> %v, ptr addrspace(1) undef
358  call void @extern_func()
359  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
360  ret <4 x float> %v1
361}
362
363declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
364
365attributes #0 = { nounwind writeonly }
366attributes #1 = { nounwind readonly }
367attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
368