xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (revision 335620853117153e52ce54fe4e879f66aa23ff99)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
3; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
4; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
5; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
6
; @basic: smallest case in this file. The entry block branches on the i1
; returned by llvm.amdgcn.init.whole.wave: true goes to %shader, false goes
; straight to %tail. %shader adds 42 to %x and 5 to field 0 of %vgpr. %tail
; merges both paths with phis, adds 32 to the merged %x, stores that sum in
; field 3 of the struct and passes the struct to llvm.amdgcn.cs.chain.
; NOTE(review): presumably the intrinsic is true only for lanes that were
; active on entry - confirm against the AMDGPU usage guide.
7define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
8; GISEL12-LABEL: basic:
9; GISEL12:       ; %bb.0: ; %entry
10; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
11; GISEL12-NEXT:    s_wait_expcnt 0x0
12; GISEL12-NEXT:    s_wait_samplecnt 0x0
13; GISEL12-NEXT:    s_wait_bvhcnt 0x0
14; GISEL12-NEXT:    s_wait_kmcnt 0x0
15; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
16; GISEL12-NEXT:    s_mov_b32 s6, s3
17; GISEL12-NEXT:    s_mov_b32 s7, s4
18; GISEL12-NEXT:    s_wait_alu 0xfffe
19; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
20; GISEL12-NEXT:  ; %bb.1: ; %shader
21; GISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
22; GISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
23; GISEL12-NEXT:  ; %bb.2: ; %tail
24; GISEL12-NEXT:    s_wait_alu 0xfffe
25; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
26; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
27; GISEL12-NEXT:    v_add_nc_u32_e32 v11, 32, v12
28; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
29; GISEL12-NEXT:    s_wait_alu 0xfffe
30; GISEL12-NEXT:    s_setpc_b64 s[6:7]
31;
32; DAGISEL12-LABEL: basic:
33; DAGISEL12:       ; %bb.0: ; %entry
34; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
35; DAGISEL12-NEXT:    s_wait_expcnt 0x0
36; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
37; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
38; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
39; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
40; DAGISEL12-NEXT:    s_mov_b32 s7, s4
41; DAGISEL12-NEXT:    s_mov_b32 s6, s3
42; DAGISEL12-NEXT:    s_wait_alu 0xfffe
43; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
44; DAGISEL12-NEXT:  ; %bb.1: ; %shader
45; DAGISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
46; DAGISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
47; DAGISEL12-NEXT:  ; %bb.2: ; %tail
48; DAGISEL12-NEXT:    s_wait_alu 0xfffe
49; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
50; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
51; DAGISEL12-NEXT:    v_add_nc_u32_e32 v11, 32, v12
52; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
53; DAGISEL12-NEXT:    s_wait_alu 0xfffe
54; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
55;
56; GISEL10-LABEL: basic:
57; GISEL10:       ; %bb.0: ; %entry
58; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
60; GISEL10-NEXT:    s_mov_b32 s6, s3
61; GISEL10-NEXT:    s_mov_b32 s7, s4
62; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
63; GISEL10-NEXT:  ; %bb.1: ; %shader
64; GISEL10-NEXT:    v_add_nc_u32_e32 v12, 42, v12
65; GISEL10-NEXT:    v_add_nc_u32_e32 v8, 5, v8
66; GISEL10-NEXT:  ; %bb.2: ; %tail
67; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
68; GISEL10-NEXT:    v_add_nc_u32_e32 v11, 32, v12
69; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
70; GISEL10-NEXT:    s_setpc_b64 s[6:7]
71;
72; DAGISEL10-LABEL: basic:
73; DAGISEL10:       ; %bb.0: ; %entry
74; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
76; DAGISEL10-NEXT:    s_mov_b32 s7, s4
77; DAGISEL10-NEXT:    s_mov_b32 s6, s3
78; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
79; DAGISEL10-NEXT:  ; %bb.1: ; %shader
80; DAGISEL10-NEXT:    v_add_nc_u32_e32 v12, 42, v12
81; DAGISEL10-NEXT:    v_add_nc_u32_e32 v8, 5, v8
82; DAGISEL10-NEXT:  ; %bb.2: ; %tail
83; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
84; DAGISEL10-NEXT:    v_add_nc_u32_e32 v11, 32, v12
85; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
86; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
87entry:
  ; Branch on the intrinsic's i1 result: true -> %shader, false -> %tail.
88  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
89  br i1 %entry_exec, label %shader, label %tail
90
91shader:
  ; Add 42 to %x and 5 to field 0 of %vgpr; the other fields are left alone.
92  %newx = add i32 %x, 42
93  %oldval = extractvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, 0
94  %newval = add i32 %oldval, 5
95  %newvgpr = insertvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %newval, 0
96
97  br label %tail
98
99tail:
  ; The %entry edges of both phis carry the unmodified %x and %vgpr.
100  %full.x = phi i32 [%x, %entry], [%newx, %shader]
101  %full.vgpr = phi { i32, ptr addrspace(5), i32, i32 } [%vgpr, %entry], [%newvgpr, %shader]
  ; Field 3 of the outgoing struct is overwritten with (merged %x) + 32.
102  %modified.x = add i32 %full.x, 32
103  %vgpr.args = insertvalue { i32, ptr addrspace(5), i32, i32 } %full.vgpr, i32 %modified.x, 3
104  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
105  unreachable
106}
107
; @wwm_in_shader: same entry/shader/tail skeleton as the other tests, but
; %shader additionally runs a set.inactive -> icmp -> ballot -> strict.wwm
; chain. %tail merges the two scalar results with separate phis (%x / %y are
; the incoming values on the %entry edge) and writes them into fields 2 and 3
; of %vgpr before the llvm.amdgcn.cs.chain call.
108define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
109; GISEL12-LABEL: wwm_in_shader:
110; GISEL12:       ; %bb.0: ; %entry
111; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
112; GISEL12-NEXT:    s_wait_expcnt 0x0
113; GISEL12-NEXT:    s_wait_samplecnt 0x0
114; GISEL12-NEXT:    s_wait_bvhcnt 0x0
115; GISEL12-NEXT:    s_wait_kmcnt 0x0
116; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
117; GISEL12-NEXT:    v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v11, v13
118; GISEL12-NEXT:    s_mov_b32 s6, s3
119; GISEL12-NEXT:    s_mov_b32 s7, s4
120; GISEL12-NEXT:    s_wait_alu 0xfffe
121; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
122; GISEL12-NEXT:  ; %bb.1: ; %shader
123; GISEL12-NEXT:    s_or_saveexec_b32 s4, -1
124; GISEL12-NEXT:    s_wait_alu 0xfffe
125; GISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
126; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
127; GISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
128; GISEL12-NEXT:    v_mov_b32_e32 v0, s8
129; GISEL12-NEXT:    s_mov_b32 exec_lo, s4
130; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
131; GISEL12-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
132; GISEL12-NEXT:  ; %bb.2: ; %tail
133; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
134; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
135; GISEL12-NEXT:    s_wait_alu 0xfffe
136; GISEL12-NEXT:    s_setpc_b64 s[6:7]
137;
138; DAGISEL12-LABEL: wwm_in_shader:
139; DAGISEL12:       ; %bb.0: ; %entry
140; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
141; DAGISEL12-NEXT:    s_wait_expcnt 0x0
142; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
143; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
144; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
145; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
146; DAGISEL12-NEXT:    v_dual_mov_b32 v11, v13 :: v_dual_mov_b32 v10, v12
147; DAGISEL12-NEXT:    s_mov_b32 s7, s4
148; DAGISEL12-NEXT:    s_mov_b32 s6, s3
149; DAGISEL12-NEXT:    s_wait_alu 0xfffe
150; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
151; DAGISEL12-NEXT:  ; %bb.1: ; %shader
152; DAGISEL12-NEXT:    s_or_saveexec_b32 s4, -1
153; DAGISEL12-NEXT:    s_wait_alu 0xfffe
154; DAGISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
155; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
156; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
157; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s4
158; DAGISEL12-NEXT:    v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
159; DAGISEL12-NEXT:  ; %bb.2: ; %tail
160; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
161; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
162; DAGISEL12-NEXT:    s_wait_alu 0xfffe
163; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
164;
165; GISEL10-LABEL: wwm_in_shader:
166; GISEL10:       ; %bb.0: ; %entry
167; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
169; GISEL10-NEXT:    v_mov_b32_e32 v10, v12
170; GISEL10-NEXT:    v_mov_b32_e32 v11, v13
171; GISEL10-NEXT:    s_mov_b32 s6, s3
172; GISEL10-NEXT:    s_mov_b32 s7, s4
173; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
174; GISEL10-NEXT:  ; %bb.1: ; %shader
175; GISEL10-NEXT:    s_or_saveexec_b32 s4, -1
176; GISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
177; GISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
178; GISEL10-NEXT:    v_mov_b32_e32 v0, s8
179; GISEL10-NEXT:    s_mov_b32 exec_lo, s4
180; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v10
181; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
182; GISEL10-NEXT:  ; %bb.2: ; %tail
183; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
184; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
185; GISEL10-NEXT:    s_setpc_b64 s[6:7]
186;
187; DAGISEL10-LABEL: wwm_in_shader:
188; DAGISEL10:       ; %bb.0: ; %entry
189; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
191; DAGISEL10-NEXT:    v_mov_b32_e32 v11, v13
192; DAGISEL10-NEXT:    v_mov_b32_e32 v10, v12
193; DAGISEL10-NEXT:    s_mov_b32 s7, s4
194; DAGISEL10-NEXT:    s_mov_b32 s6, s3
195; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
196; DAGISEL10-NEXT:  ; %bb.1: ; %shader
197; DAGISEL10-NEXT:    s_or_saveexec_b32 s4, -1
198; DAGISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
199; DAGISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
200; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s4
201; DAGISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v10
202; DAGISEL10-NEXT:    v_mov_b32_e32 v11, s8
203; DAGISEL10-NEXT:  ; %bb.2: ; %tail
204; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
205; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
206; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
207entry:
  ; Branch on the intrinsic's i1 result: true -> %shader, false -> %tail.
208  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
209  br i1 %entry_exec, label %shader, label %tail
210
211shader:
  ; Plain (non-WWM) computation on %x.
212  %nonwwm = add i32 %x, 42
213
  ; WWM chain: set.inactive(%x, 71) -> compare against 0 -> ballot ->
  ; strict.wwm. The constant 71 is the 0x47 immediate in the CHECK lines.
214  %full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 71)
215  %non.zero = icmp ne i32 %full.vgpr, 0
216  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %non.zero)
217  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %ballot)
218
219  br label %tail
220
221tail:
  ; Scalar phis: the %entry edge supplies %x and %y, the %shader edge
  ; supplies the values computed above.
222  %full.nonwwm = phi i32 [%x, %entry], [%nonwwm, %shader]
223  %full.wwm = phi i32 [%y, %entry], [%wwm, %shader]
  ; The merged values go into fields 2 and 3 of the struct passed on.
224  %vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr, i32 %full.nonwwm, 2
225  %vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr.1, i32 %full.wwm, 3
226  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.2, i32 0)
227  unreachable
228}
229
; @phi_whole_struct: performs the same add-42 and set.inactive/ballot/
; strict.wwm computations as @wwm_in_shader, but the insertvalues into fields
; 2 and 3 happen inside %shader and %tail merges the paths with a single phi
; over the whole struct (the %entry edge carries the unmodified %vgpr).
230define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
231; GISEL12-LABEL: phi_whole_struct:
232; GISEL12:       ; %bb.0: ; %entry
233; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
234; GISEL12-NEXT:    s_wait_expcnt 0x0
235; GISEL12-NEXT:    s_wait_samplecnt 0x0
236; GISEL12-NEXT:    s_wait_bvhcnt 0x0
237; GISEL12-NEXT:    s_wait_kmcnt 0x0
238; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
239; GISEL12-NEXT:    s_mov_b32 s6, s3
240; GISEL12-NEXT:    s_mov_b32 s7, s4
241; GISEL12-NEXT:    s_wait_alu 0xfffe
242; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
243; GISEL12-NEXT:  ; %bb.1: ; %shader
244; GISEL12-NEXT:    s_or_saveexec_b32 s4, -1
245; GISEL12-NEXT:    s_wait_alu 0xfffe
246; GISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v12, s4
247; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
248; GISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
249; GISEL12-NEXT:    v_mov_b32_e32 v0, s8
250; GISEL12-NEXT:    s_mov_b32 exec_lo, s4
251; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
252; GISEL12-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v12
253; GISEL12-NEXT:  ; %bb.2: ; %tail
254; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
255; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
256; GISEL12-NEXT:    s_wait_alu 0xfffe
257; GISEL12-NEXT:    s_setpc_b64 s[6:7]
258;
259; DAGISEL12-LABEL: phi_whole_struct:
260; DAGISEL12:       ; %bb.0: ; %entry
261; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
262; DAGISEL12-NEXT:    s_wait_expcnt 0x0
263; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
264; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
265; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
266; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
267; DAGISEL12-NEXT:    s_mov_b32 s7, s4
268; DAGISEL12-NEXT:    s_mov_b32 s6, s3
269; DAGISEL12-NEXT:    s_wait_alu 0xfffe
270; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
271; DAGISEL12-NEXT:  ; %bb.1: ; %shader
272; DAGISEL12-NEXT:    s_or_saveexec_b32 s4, -1
273; DAGISEL12-NEXT:    s_wait_alu 0xfffe
274; DAGISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v12, s4
275; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
276; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
277; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s4
278; DAGISEL12-NEXT:    v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12
279; DAGISEL12-NEXT:  ; %bb.2: ; %tail
280; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
281; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
282; DAGISEL12-NEXT:    s_wait_alu 0xfffe
283; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
284;
285; GISEL10-LABEL: phi_whole_struct:
286; GISEL10:       ; %bb.0: ; %entry
287; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
289; GISEL10-NEXT:    s_mov_b32 s6, s3
290; GISEL10-NEXT:    s_mov_b32 s7, s4
291; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
292; GISEL10-NEXT:  ; %bb.1: ; %shader
293; GISEL10-NEXT:    s_or_saveexec_b32 s4, -1
294; GISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v12, s4
295; GISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
296; GISEL10-NEXT:    v_mov_b32_e32 v0, s8
297; GISEL10-NEXT:    s_mov_b32 exec_lo, s4
298; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v12
299; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
300; GISEL10-NEXT:  ; %bb.2: ; %tail
301; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
302; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
303; GISEL10-NEXT:    s_setpc_b64 s[6:7]
304;
305; DAGISEL10-LABEL: phi_whole_struct:
306; DAGISEL10:       ; %bb.0: ; %entry
307; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
309; DAGISEL10-NEXT:    s_mov_b32 s7, s4
310; DAGISEL10-NEXT:    s_mov_b32 s6, s3
311; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
312; DAGISEL10-NEXT:  ; %bb.1: ; %shader
313; DAGISEL10-NEXT:    s_or_saveexec_b32 s4, -1
314; DAGISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v12, s4
315; DAGISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
316; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s4
317; DAGISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v12
318; DAGISEL10-NEXT:    v_mov_b32_e32 v11, s8
319; DAGISEL10-NEXT:  ; %bb.2: ; %tail
320; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
321; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
322; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
323entry:
  ; Branch on the intrinsic's i1 result: true -> %shader, false -> %tail.
324  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
325  br i1 %entry_exec, label %shader, label %tail
326
327shader:
  ; Non-WWM value: %x + 42, stored in field 2 of the struct.
328  %nonwwm = add i32 %x, 42
329  %vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr, i32 %nonwwm, 2
330
  ; WWM chain (set.inactive with 71 -> icmp ne 0 -> ballot -> strict.wwm),
  ; stored in field 3 of the struct.
331  %full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 71)
332  %non.zero = icmp ne i32 %full.vgpr, 0
333  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %non.zero)
334  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %ballot)
335  %vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr.1, i32 %wwm, 3
336
337  br label %tail
338
339tail:
  ; One phi over the whole aggregate instead of per-field scalar phis.
340  %vgpr.args = phi { i32, ptr addrspace(5), i32, i32} [%vgpr, %entry], [%vgpr.2, %shader]
341  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
342  unreachable
343}
344
345; Introduce more complex control flow - %shader contains a simple loop, and %tail contains an if.
; The loop in %shader starts %i at %x, increments it by 1 each iteration and
; repeats while the pre-increment %i differs from %y. After the loop, %tail
; branches on %x >= %y (signed): %tail.then writes the constant 44 into field
; 0, %tail.else writes strict.wwm(15) into field 0, and %tail.end merges the
; two structs and passes the result to llvm.amdgcn.cs.chain.
346define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
347; GISEL12-LABEL: control_flow:
348; GISEL12:       ; %bb.0: ; %entry
349; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
350; GISEL12-NEXT:    s_wait_expcnt 0x0
351; GISEL12-NEXT:    s_wait_samplecnt 0x0
352; GISEL12-NEXT:    s_wait_bvhcnt 0x0
353; GISEL12-NEXT:    s_wait_kmcnt 0x0
354; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
355; GISEL12-NEXT:    s_mov_b32 s6, s3
356; GISEL12-NEXT:    s_mov_b32 s7, s4
357; GISEL12-NEXT:    s_wait_alu 0xfffe
358; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
359; GISEL12-NEXT:    s_cbranch_execz .LBB3_4
360; GISEL12-NEXT:  ; %bb.1: ; %shader.preheader
361; GISEL12-NEXT:    v_add_nc_u32_e32 v1, -1, v12
362; GISEL12-NEXT:    s_mov_b32 s4, 0
363; GISEL12-NEXT:  .LBB3_2: ; %shader
364; GISEL12-NEXT:    ; =>This Inner Loop Header: Depth=1
365; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
366; GISEL12-NEXT:    v_add_nc_u32_e32 v1, 1, v1
367; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
368; GISEL12-NEXT:    s_wait_alu 0xfffe
369; GISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v1, s8
370; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
371; GISEL12-NEXT:    v_cmp_ne_u32_e64 s9, 0, v0
372; GISEL12-NEXT:    v_mov_b32_e32 v0, s9
373; GISEL12-NEXT:    s_mov_b32 exec_lo, s8
374; GISEL12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v13, v1
375; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
376; GISEL12-NEXT:    v_mov_b32_e32 v11, v0
377; GISEL12-NEXT:    s_or_b32 s4, vcc_lo, s4
378; GISEL12-NEXT:    s_wait_alu 0xfffe
379; GISEL12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
380; GISEL12-NEXT:    s_cbranch_execnz .LBB3_2
381; GISEL12-NEXT:  ; %bb.3: ; %tail.loopexit
382; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
383; GISEL12-NEXT:    v_add_nc_u32_e32 v10, 42, v1
384; GISEL12-NEXT:  .LBB3_4: ; %Flow1
385; GISEL12-NEXT:    s_wait_alu 0xfffe
386; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
387; GISEL12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
388; GISEL12-NEXT:    s_mov_b32 s3, exec_lo
389; GISEL12-NEXT:    ; implicit-def: $vgpr8
390; GISEL12-NEXT:    v_cmpx_lt_i32_e64 v12, v13
391; GISEL12-NEXT:    s_wait_alu 0xfffe
392; GISEL12-NEXT:    s_xor_b32 s3, exec_lo, s3
393; GISEL12-NEXT:  ; %bb.5: ; %tail.else
394; GISEL12-NEXT:    s_or_saveexec_b32 s4, -1
395; GISEL12-NEXT:    v_mov_b32_e32 v0, 15
396; GISEL12-NEXT:    s_wait_alu 0xfffe
397; GISEL12-NEXT:    s_mov_b32 exec_lo, s4
398; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
399; GISEL12-NEXT:    v_mov_b32_e32 v8, v0
400; GISEL12-NEXT:  ; %bb.6: ; %Flow
401; GISEL12-NEXT:    s_and_not1_saveexec_b32 s3, s3
402; GISEL12-NEXT:  ; %bb.7: ; %tail.then
403; GISEL12-NEXT:    s_mov_b32 s4, 44
404; GISEL12-NEXT:    s_wait_alu 0xfffe
405; GISEL12-NEXT:    v_mov_b32_e32 v8, s4
406; GISEL12-NEXT:  ; %bb.8: ; %tail.end
407; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
408; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
409; GISEL12-NEXT:    s_wait_alu 0xfffe
410; GISEL12-NEXT:    s_setpc_b64 s[6:7]
411;
412; DAGISEL12-LABEL: control_flow:
413; DAGISEL12:       ; %bb.0: ; %entry
414; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
415; DAGISEL12-NEXT:    s_wait_expcnt 0x0
416; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
417; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
418; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
419; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
420; DAGISEL12-NEXT:    s_mov_b32 s7, s4
421; DAGISEL12-NEXT:    s_mov_b32 s6, s3
422; DAGISEL12-NEXT:    s_wait_alu 0xfffe
423; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
424; DAGISEL12-NEXT:    s_cbranch_execz .LBB3_4
425; DAGISEL12-NEXT:  ; %bb.1: ; %shader.preheader
426; DAGISEL12-NEXT:    v_add_nc_u32_e32 v1, -1, v12
427; DAGISEL12-NEXT:    s_mov_b32 s4, 0
428; DAGISEL12-NEXT:  .LBB3_2: ; %shader
429; DAGISEL12-NEXT:    ; =>This Inner Loop Header: Depth=1
430; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
431; DAGISEL12-NEXT:    v_add_nc_u32_e32 v1, 1, v1
432; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
433; DAGISEL12-NEXT:    s_wait_alu 0xfffe
434; DAGISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v1, s8
435; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
436; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s9, 0, v0
437; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s8
438; DAGISEL12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v13, v1
439; DAGISEL12-NEXT:    v_mov_b32_e32 v11, s9
440; DAGISEL12-NEXT:    s_or_b32 s4, vcc_lo, s4
441; DAGISEL12-NEXT:    s_wait_alu 0xfffe
442; DAGISEL12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
443; DAGISEL12-NEXT:    s_cbranch_execnz .LBB3_2
444; DAGISEL12-NEXT:  ; %bb.3: ; %tail.loopexit
445; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
446; DAGISEL12-NEXT:    v_add_nc_u32_e32 v10, 42, v1
447; DAGISEL12-NEXT:  .LBB3_4: ; %Flow1
448; DAGISEL12-NEXT:    s_wait_alu 0xfffe
449; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
450; DAGISEL12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
451; DAGISEL12-NEXT:    s_mov_b32 s3, exec_lo
452; DAGISEL12-NEXT:    ; implicit-def: $vgpr8
453; DAGISEL12-NEXT:    v_cmpx_lt_i32_e64 v12, v13
454; DAGISEL12-NEXT:    s_wait_alu 0xfffe
455; DAGISEL12-NEXT:    s_xor_b32 s3, exec_lo, s3
456; DAGISEL12-NEXT:  ; %bb.5: ; %tail.else
457; DAGISEL12-NEXT:    s_mov_b32 s4, 15
458; DAGISEL12-NEXT:    s_wait_alu 0xfffe
459; DAGISEL12-NEXT:    v_mov_b32_e32 v8, s4
460; DAGISEL12-NEXT:  ; %bb.6: ; %Flow
461; DAGISEL12-NEXT:    s_and_not1_saveexec_b32 s3, s3
462; DAGISEL12-NEXT:  ; %bb.7: ; %tail.then
463; DAGISEL12-NEXT:    v_mov_b32_e32 v8, 44
464; DAGISEL12-NEXT:  ; %bb.8: ; %tail.end
465; DAGISEL12-NEXT:    s_wait_alu 0xfffe
466; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
467; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
468; DAGISEL12-NEXT:    s_wait_alu 0xfffe
469; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
470;
471; GISEL10-LABEL: control_flow:
472; GISEL10:       ; %bb.0: ; %entry
473; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
475; GISEL10-NEXT:    s_mov_b32 s6, s3
476; GISEL10-NEXT:    s_mov_b32 s7, s4
477; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
478; GISEL10-NEXT:    s_cbranch_execz .LBB3_4
479; GISEL10-NEXT:  ; %bb.1: ; %shader.preheader
480; GISEL10-NEXT:    v_add_nc_u32_e32 v1, -1, v12
481; GISEL10-NEXT:    s_mov_b32 s4, 0
482; GISEL10-NEXT:  .LBB3_2: ; %shader
483; GISEL10-NEXT:    ; =>This Inner Loop Header: Depth=1
484; GISEL10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
485; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
486; GISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v1, s8
487; GISEL10-NEXT:    v_cmp_ne_u32_e64 s9, 0, v0
488; GISEL10-NEXT:    v_mov_b32_e32 v0, s9
489; GISEL10-NEXT:    s_mov_b32 exec_lo, s8
490; GISEL10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v13, v1
491; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
492; GISEL10-NEXT:    s_or_b32 s4, vcc_lo, s4
493; GISEL10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
494; GISEL10-NEXT:    s_cbranch_execnz .LBB3_2
495; GISEL10-NEXT:  ; %bb.3: ; %tail.loopexit
496; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
497; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v1
498; GISEL10-NEXT:  .LBB3_4: ; %Flow1
499; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
500; GISEL10-NEXT:    s_mov_b32 s3, exec_lo
501; GISEL10-NEXT:    ; implicit-def: $vgpr8
502; GISEL10-NEXT:    v_cmpx_lt_i32_e64 v12, v13
503; GISEL10-NEXT:    s_xor_b32 s3, exec_lo, s3
504; GISEL10-NEXT:  ; %bb.5: ; %tail.else
505; GISEL10-NEXT:    s_or_saveexec_b32 s4, -1
506; GISEL10-NEXT:    v_mov_b32_e32 v0, 15
507; GISEL10-NEXT:    s_mov_b32 exec_lo, s4
508; GISEL10-NEXT:    v_mov_b32_e32 v8, v0
509; GISEL10-NEXT:  ; %bb.6: ; %Flow
510; GISEL10-NEXT:    s_andn2_saveexec_b32 s3, s3
511; GISEL10-NEXT:  ; %bb.7: ; %tail.then
512; GISEL10-NEXT:    s_mov_b32 s4, 44
513; GISEL10-NEXT:    v_mov_b32_e32 v8, s4
514; GISEL10-NEXT:  ; %bb.8: ; %tail.end
515; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
516; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
517; GISEL10-NEXT:    s_setpc_b64 s[6:7]
518;
519; DAGISEL10-LABEL: control_flow:
520; DAGISEL10:       ; %bb.0: ; %entry
521; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
523; DAGISEL10-NEXT:    s_mov_b32 s7, s4
524; DAGISEL10-NEXT:    s_mov_b32 s6, s3
525; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
526; DAGISEL10-NEXT:    s_cbranch_execz .LBB3_4
527; DAGISEL10-NEXT:  ; %bb.1: ; %shader.preheader
528; DAGISEL10-NEXT:    v_add_nc_u32_e32 v1, -1, v12
529; DAGISEL10-NEXT:    s_mov_b32 s4, 0
530; DAGISEL10-NEXT:  .LBB3_2: ; %shader
531; DAGISEL10-NEXT:    ; =>This Inner Loop Header: Depth=1
532; DAGISEL10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
533; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
534; DAGISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v1, s8
535; DAGISEL10-NEXT:    v_cmp_ne_u32_e64 s9, 0, v0
536; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s8
537; DAGISEL10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v13, v1
538; DAGISEL10-NEXT:    v_mov_b32_e32 v11, s9
539; DAGISEL10-NEXT:    s_or_b32 s4, vcc_lo, s4
540; DAGISEL10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
541; DAGISEL10-NEXT:    s_cbranch_execnz .LBB3_2
542; DAGISEL10-NEXT:  ; %bb.3: ; %tail.loopexit
543; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
544; DAGISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v1
545; DAGISEL10-NEXT:  .LBB3_4: ; %Flow1
546; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
547; DAGISEL10-NEXT:    s_mov_b32 s3, exec_lo
548; DAGISEL10-NEXT:    ; implicit-def: $vgpr8
549; DAGISEL10-NEXT:    v_cmpx_lt_i32_e64 v12, v13
550; DAGISEL10-NEXT:    s_xor_b32 s3, exec_lo, s3
551; DAGISEL10-NEXT:  ; %bb.5: ; %tail.else
552; DAGISEL10-NEXT:    s_mov_b32 s4, 15
553; DAGISEL10-NEXT:    v_mov_b32_e32 v8, s4
554; DAGISEL10-NEXT:  ; %bb.6: ; %Flow
555; DAGISEL10-NEXT:    s_andn2_saveexec_b32 s3, s3
556; DAGISEL10-NEXT:  ; %bb.7: ; %tail.then
557; DAGISEL10-NEXT:    v_mov_b32_e32 v8, 44
558; DAGISEL10-NEXT:  ; %bb.8: ; %tail.end
559; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
560; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
561; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
562entry:
  ; Branch on the intrinsic's i1 result: true -> %shader, false -> %tail.
563  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
564  br i1 %entry_exec, label %shader, label %tail
565
566shader:
  ; Loop header and body in one block: %i starts at %x and takes %i.inc on
  ; the back edge.
567  %i = phi i32 [%x, %entry], [%i.inc, %shader]
568
  ; Non-WWM value: %i + 42, stored in field 2.
569  %nonwwm = add i32 %i, 42
570  %vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr, i32 %nonwwm, 2
571
  ; WWM chain on the loop counter (set.inactive with 71 -> icmp ne 0 ->
  ; ballot -> strict.wwm), stored in field 3.
572  %full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %i, i32 71)
573  %non.zero = icmp ne i32 %full.vgpr, 0
574  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %non.zero)
575  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %ballot)
576  %vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr.1, i32 %wwm, 3
577
  ; The exit test compares the pre-increment %i with %y, so the loop leaves
  ; after the iteration in which %i == %y.
578  %i.inc = add i32 %i, 1
579  %loop.cond = icmp ne i32 %i, %y
580  br i1 %loop.cond, label %shader, label %tail
581
582tail:
  ; Whole-struct phi: unmodified %vgpr from %entry, loop result from %shader.
583  %vgpr.tail = phi { i32, ptr addrspace(5), i32, i32} [%vgpr, %entry], [%vgpr.2, %shader]
584
  ; Signed compare of the original arguments selects the then/else arm.
585  %if.cond = icmp sge i32 %x, %y
586  br i1 %if.cond, label %tail.then, label %tail.else
587
588tail.then:
  ; Field 0 := constant 44 (no WWM involved on this arm).
589  %vgpr.then = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr.tail, i32 44, 0
590  br label %tail.end
591
592tail.else:
  ; Field 0 := strict.wwm(15), i.e. a WWM value produced outside %shader.
593  %wwm.tail = call i32 @llvm.amdgcn.strict.wwm.i32(i32 15)
594  %vgpr.else = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr.tail, i32 %wwm.tail, 0
595  br label %tail.end
596
597tail.end:
  ; Merge both arms and chain to %callee with the resulting struct.
598  %vgpr.args = phi { i32, ptr addrspace(5), i32, i32 } [%vgpr.then, %tail.then], [%vgpr.else, %tail.else]
599  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
600  unreachable
601}
602
603; Try with v0-v7 occupied - this will force us to use higher registers for temporaries. Make sure we don't preserve them.
604define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
605; GISEL12-LABEL: use_v0_7:
606; GISEL12:       ; %bb.0: ; %entry
607; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
608; GISEL12-NEXT:    s_wait_expcnt 0x0
609; GISEL12-NEXT:    s_wait_samplecnt 0x0
610; GISEL12-NEXT:    s_wait_bvhcnt 0x0
611; GISEL12-NEXT:    s_wait_kmcnt 0x0
612; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
613; GISEL12-NEXT:    s_mov_b32 s6, s3
614; GISEL12-NEXT:    s_mov_b32 s7, s4
615; GISEL12-NEXT:    s_wait_alu 0xfffe
616; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
617; GISEL12-NEXT:    s_cbranch_execz .LBB4_2
618; GISEL12-NEXT:  ; %bb.1: ; %shader
619; GISEL12-NEXT:    s_or_saveexec_b32 s4, -1
620; GISEL12-NEXT:    s_wait_alu 0xfffe
621; GISEL12-NEXT:    v_cndmask_b32_e64 v13, 0x47, v12, s4
622; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
623; GISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v13
624; GISEL12-NEXT:    v_mov_b32_e32 v13, s8
625; GISEL12-NEXT:    s_mov_b32 exec_lo, s4
626; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
627; GISEL12-NEXT:    v_dual_mov_b32 v11, v13 :: v_dual_add_nc_u32 v10, 42, v12
628; GISEL12-NEXT:    ;;#ASMSTART
629; GISEL12-NEXT:    ; use v0-7
630; GISEL12-NEXT:    ;;#ASMEND
631; GISEL12-NEXT:  .LBB4_2: ; %tail
632; GISEL12-NEXT:    s_wait_alu 0xfffe
633; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
634; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
635; GISEL12-NEXT:    s_wait_alu 0xfffe
636; GISEL12-NEXT:    s_setpc_b64 s[6:7]
637;
638; DAGISEL12-LABEL: use_v0_7:
639; DAGISEL12:       ; %bb.0: ; %entry
640; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
641; DAGISEL12-NEXT:    s_wait_expcnt 0x0
642; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
643; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
644; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
645; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
646; DAGISEL12-NEXT:    s_mov_b32 s7, s4
647; DAGISEL12-NEXT:    s_mov_b32 s6, s3
648; DAGISEL12-NEXT:    s_wait_alu 0xfffe
649; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
650; DAGISEL12-NEXT:    s_cbranch_execz .LBB4_2
651; DAGISEL12-NEXT:  ; %bb.1: ; %shader
652; DAGISEL12-NEXT:    s_or_saveexec_b32 s4, -1
653; DAGISEL12-NEXT:    s_wait_alu 0xfffe
654; DAGISEL12-NEXT:    v_cndmask_b32_e64 v13, 0x47, v12, s4
655; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
656; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v13
657; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s4
658; DAGISEL12-NEXT:    v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12
659; DAGISEL12-NEXT:    ;;#ASMSTART
660; DAGISEL12-NEXT:    ; use v0-7
661; DAGISEL12-NEXT:    ;;#ASMEND
662; DAGISEL12-NEXT:  .LBB4_2: ; %tail
663; DAGISEL12-NEXT:    s_wait_alu 0xfffe
664; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
665; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
666; DAGISEL12-NEXT:    s_wait_alu 0xfffe
667; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
668;
669; GISEL10-LABEL: use_v0_7:
670; GISEL10:       ; %bb.0: ; %entry
671; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
673; GISEL10-NEXT:    s_mov_b32 s6, s3
674; GISEL10-NEXT:    s_mov_b32 s7, s4
675; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
676; GISEL10-NEXT:    s_cbranch_execz .LBB4_2
677; GISEL10-NEXT:  ; %bb.1: ; %shader
678; GISEL10-NEXT:    s_or_saveexec_b32 s4, -1
679; GISEL10-NEXT:    v_cndmask_b32_e64 v13, 0x47, v12, s4
680; GISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v13
681; GISEL10-NEXT:    v_mov_b32_e32 v13, s8
682; GISEL10-NEXT:    s_mov_b32 exec_lo, s4
683; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v12
684; GISEL10-NEXT:    v_mov_b32_e32 v11, v13
685; GISEL10-NEXT:    ;;#ASMSTART
686; GISEL10-NEXT:    ; use v0-7
687; GISEL10-NEXT:    ;;#ASMEND
688; GISEL10-NEXT:  .LBB4_2: ; %tail
689; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
690; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
691; GISEL10-NEXT:    s_setpc_b64 s[6:7]
692;
693; DAGISEL10-LABEL: use_v0_7:
694; DAGISEL10:       ; %bb.0: ; %entry
695; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
697; DAGISEL10-NEXT:    s_mov_b32 s7, s4
698; DAGISEL10-NEXT:    s_mov_b32 s6, s3
699; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
700; DAGISEL10-NEXT:    s_cbranch_execz .LBB4_2
701; DAGISEL10-NEXT:  ; %bb.1: ; %shader
702; DAGISEL10-NEXT:    s_or_saveexec_b32 s4, -1
703; DAGISEL10-NEXT:    v_cndmask_b32_e64 v13, 0x47, v12, s4
704; DAGISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v13
705; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s4
706; DAGISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v12
707; DAGISEL10-NEXT:    v_mov_b32_e32 v11, s8
708; DAGISEL10-NEXT:    ;;#ASMSTART
709; DAGISEL10-NEXT:    ; use v0-7
710; DAGISEL10-NEXT:    ;;#ASMEND
711; DAGISEL10-NEXT:  .LBB4_2: ; %tail
712; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
713; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
714; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
715entry:
716  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
717  br i1 %entry_exec, label %shader, label %tail
718
719shader:
720  call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
721
722  %nonwwm = add i32 %x, 42
723  %vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr, i32 %nonwwm, 2
724
725  %full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 71)
726  %non.zero = icmp ne i32 %full.vgpr, 0
727  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %non.zero)
728  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %ballot)
729  %vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i32} %vgpr.1, i32 %wwm, 3
730
731  br label %tail
732
733tail:
734  %vgpr.args = phi { i32, ptr addrspace(5), i32, i32} [%vgpr, %entry], [%vgpr.2, %shader]
735  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
736  unreachable
737}
738
739
740; Check that the inactive lanes of v8:15 are correctly preserved even across a
741; WWM call that reads and writes them.
742; FIXME: The GlobalISel path hits a pre-existing issue, so the inactive lanes do get overwritten.
; In the expected output below, the <16 x i32> %vgpr argument is copied out of
; v8-v23 on entry, while the call to @write_v0_v15 has its operands set up in
; v0-v15 and its results read back from v0-v15 around the s_swappc_b64.
743define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, <16 x i32> %vgpr, i32 %x, i32 %y) {
744; GISEL12-LABEL: wwm_write_to_arg_reg:
745; GISEL12:       ; %bb.0: ; %entry
746; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
747; GISEL12-NEXT:    s_wait_expcnt 0x0
748; GISEL12-NEXT:    s_wait_samplecnt 0x0
749; GISEL12-NEXT:    s_wait_bvhcnt 0x0
750; GISEL12-NEXT:    s_wait_kmcnt 0x0
751; GISEL12-NEXT:    s_mov_b32 s32, 0
752; GISEL12-NEXT:    s_or_saveexec_b32 s9, -1
753; GISEL12-NEXT:    s_or_saveexec_b32 s12, -1
754; GISEL12-NEXT:    s_mov_b32 s6, s0
755; GISEL12-NEXT:    s_mov_b32 s7, s1
756; GISEL12-NEXT:    s_mov_b32 s8, s2
757; GISEL12-NEXT:    s_mov_b32 s10, s3
758; GISEL12-NEXT:    s_mov_b32 s11, s4
759; GISEL12-NEXT:    v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
760; GISEL12-NEXT:    v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v27, v11
761; GISEL12-NEXT:    v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v29, v13
762; GISEL12-NEXT:    v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v31, v15
763; GISEL12-NEXT:    v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v33, v17
764; GISEL12-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v35, v19
765; GISEL12-NEXT:    v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v37, v21
766; GISEL12-NEXT:    v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v39, v23
767; GISEL12-NEXT:    s_wait_alu 0xfffe
768; GISEL12-NEXT:    s_mov_b32 exec_lo, s12
769; GISEL12-NEXT:    s_and_saveexec_b32 s4, s9
770; GISEL12-NEXT:    s_cbranch_execz .LBB5_2
771; GISEL12-NEXT:  ; %bb.1: ; %shader
772; GISEL12-NEXT:    s_or_saveexec_b32 s9, -1
773; GISEL12-NEXT:    s_getpc_b64 s[0:1]
774; GISEL12-NEXT:    s_wait_alu 0xfffe
775; GISEL12-NEXT:    s_sext_i32_i16 s1, s1
776; GISEL12-NEXT:    s_add_co_u32 s0, s0, write_v0_v15@gotpcrel32@lo+12
777; GISEL12-NEXT:    s_wait_alu 0xfffe
778; GISEL12-NEXT:    s_add_co_ci_u32 s1, s1, write_v0_v15@gotpcrel32@hi+24
779; GISEL12-NEXT:    v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
780; GISEL12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
781; GISEL12-NEXT:    v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
782; GISEL12-NEXT:    v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
783; GISEL12-NEXT:    v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
784; GISEL12-NEXT:    v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
785; GISEL12-NEXT:    v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
786; GISEL12-NEXT:    v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v37
787; GISEL12-NEXT:    v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
788; GISEL12-NEXT:    s_wait_kmcnt 0x0
789; GISEL12-NEXT:    s_wait_alu 0xfffe
790; GISEL12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
791; GISEL12-NEXT:    v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1
792; GISEL12-NEXT:    v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3
793; GISEL12-NEXT:    v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5
794; GISEL12-NEXT:    v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7
795; GISEL12-NEXT:    v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
796; GISEL12-NEXT:    v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
797; GISEL12-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
798; GISEL12-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
799; GISEL12-NEXT:    s_mov_b32 exec_lo, s9
800; GISEL12-NEXT:    ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
801; GISEL12-NEXT:  .LBB5_2: ; %tail
802; GISEL12-NEXT:    s_wait_alu 0xfffe
803; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
804; GISEL12-NEXT:    v_dual_mov_b32 v8, v24 :: v_dual_mov_b32 v9, v25
805; GISEL12-NEXT:    v_dual_mov_b32 v10, v26 :: v_dual_mov_b32 v11, v27
806; GISEL12-NEXT:    v_dual_mov_b32 v12, v28 :: v_dual_mov_b32 v13, v29
807; GISEL12-NEXT:    v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v15, v31
808; GISEL12-NEXT:    v_dual_mov_b32 v16, v32 :: v_dual_mov_b32 v17, v33
809; GISEL12-NEXT:    v_dual_mov_b32 v18, v34 :: v_dual_mov_b32 v19, v35
810; GISEL12-NEXT:    v_dual_mov_b32 v20, v36 :: v_dual_mov_b32 v21, v37
811; GISEL12-NEXT:    v_dual_mov_b32 v22, v38 :: v_dual_mov_b32 v23, v39
812; GISEL12-NEXT:    s_mov_b32 s0, s6
813; GISEL12-NEXT:    s_mov_b32 s1, s7
814; GISEL12-NEXT:    s_mov_b32 s2, s8
815; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
816; GISEL12-NEXT:    s_wait_alu 0xfffe
817; GISEL12-NEXT:    s_setpc_b64 s[10:11]
818;
819; DAGISEL12-LABEL: wwm_write_to_arg_reg:
820; DAGISEL12:       ; %bb.0: ; %entry
821; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
822; DAGISEL12-NEXT:    s_wait_expcnt 0x0
823; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
824; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
825; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
826; DAGISEL12-NEXT:    s_mov_b32 s32, 0
827; DAGISEL12-NEXT:    s_or_saveexec_b32 s11, -1
828; DAGISEL12-NEXT:    s_or_saveexec_b32 s6, -1
829; DAGISEL12-NEXT:    v_dual_mov_b32 v39, v23 :: v_dual_mov_b32 v38, v22
830; DAGISEL12-NEXT:    v_dual_mov_b32 v37, v21 :: v_dual_mov_b32 v36, v20
831; DAGISEL12-NEXT:    v_dual_mov_b32 v35, v19 :: v_dual_mov_b32 v34, v18
832; DAGISEL12-NEXT:    v_dual_mov_b32 v33, v17 :: v_dual_mov_b32 v32, v16
833; DAGISEL12-NEXT:    v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v30, v14
834; DAGISEL12-NEXT:    v_dual_mov_b32 v29, v13 :: v_dual_mov_b32 v28, v12
835; DAGISEL12-NEXT:    v_dual_mov_b32 v27, v11 :: v_dual_mov_b32 v26, v10
836; DAGISEL12-NEXT:    v_dual_mov_b32 v25, v9 :: v_dual_mov_b32 v24, v8
837; DAGISEL12-NEXT:    s_wait_alu 0xfffe
838; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s6
839; DAGISEL12-NEXT:    s_mov_b32 s9, s4
840; DAGISEL12-NEXT:    s_mov_b32 s8, s3
841; DAGISEL12-NEXT:    s_mov_b32 s4, s2
842; DAGISEL12-NEXT:    s_mov_b32 s6, s1
843; DAGISEL12-NEXT:    s_mov_b32 s7, s0
844; DAGISEL12-NEXT:    s_and_saveexec_b32 s10, s11
845; DAGISEL12-NEXT:    s_cbranch_execz .LBB5_2
846; DAGISEL12-NEXT:  ; %bb.1: ; %shader
847; DAGISEL12-NEXT:    s_or_saveexec_b32 s11, -1
848; DAGISEL12-NEXT:    s_getpc_b64 s[0:1]
849; DAGISEL12-NEXT:    s_wait_alu 0xfffe
850; DAGISEL12-NEXT:    s_sext_i32_i16 s1, s1
851; DAGISEL12-NEXT:    s_add_co_u32 s0, s0, write_v0_v15@gotpcrel32@lo+12
852; DAGISEL12-NEXT:    s_wait_alu 0xfffe
853; DAGISEL12-NEXT:    s_add_co_ci_u32 s1, s1, write_v0_v15@gotpcrel32@hi+24
854; DAGISEL12-NEXT:    v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
855; DAGISEL12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
856; DAGISEL12-NEXT:    v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
857; DAGISEL12-NEXT:    v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
858; DAGISEL12-NEXT:    v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
859; DAGISEL12-NEXT:    v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
860; DAGISEL12-NEXT:    v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
861; DAGISEL12-NEXT:    v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v37
862; DAGISEL12-NEXT:    v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
863; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
864; DAGISEL12-NEXT:    s_wait_alu 0xfffe
865; DAGISEL12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
866; DAGISEL12-NEXT:    v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
867; DAGISEL12-NEXT:    v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3
868; DAGISEL12-NEXT:    v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5
869; DAGISEL12-NEXT:    v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7
870; DAGISEL12-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9
871; DAGISEL12-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11
872; DAGISEL12-NEXT:    v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13
873; DAGISEL12-NEXT:    v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15
874; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s11
875; DAGISEL12-NEXT:    v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41
876; DAGISEL12-NEXT:    v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43
877; DAGISEL12-NEXT:    v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45
878; DAGISEL12-NEXT:    v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47
879; DAGISEL12-NEXT:    v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49
880; DAGISEL12-NEXT:    v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51
881; DAGISEL12-NEXT:    v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53
882; DAGISEL12-NEXT:    v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55
883; DAGISEL12-NEXT:  .LBB5_2: ; %tail
884; DAGISEL12-NEXT:    s_wait_alu 0xfffe
885; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s10
886; DAGISEL12-NEXT:    v_dual_mov_b32 v8, v24 :: v_dual_mov_b32 v9, v25
887; DAGISEL12-NEXT:    v_dual_mov_b32 v10, v26 :: v_dual_mov_b32 v11, v27
888; DAGISEL12-NEXT:    v_dual_mov_b32 v12, v28 :: v_dual_mov_b32 v13, v29
889; DAGISEL12-NEXT:    v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v15, v31
890; DAGISEL12-NEXT:    v_dual_mov_b32 v16, v32 :: v_dual_mov_b32 v17, v33
891; DAGISEL12-NEXT:    v_dual_mov_b32 v18, v34 :: v_dual_mov_b32 v19, v35
892; DAGISEL12-NEXT:    v_dual_mov_b32 v20, v36 :: v_dual_mov_b32 v21, v37
893; DAGISEL12-NEXT:    v_dual_mov_b32 v22, v38 :: v_dual_mov_b32 v23, v39
894; DAGISEL12-NEXT:    s_mov_b32 s0, s7
895; DAGISEL12-NEXT:    s_mov_b32 s1, s6
896; DAGISEL12-NEXT:    s_mov_b32 s2, s4
897; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
898; DAGISEL12-NEXT:    s_wait_alu 0xfffe
899; DAGISEL12-NEXT:    s_setpc_b64 s[8:9]
900;
901; GISEL10-LABEL: wwm_write_to_arg_reg:
902; GISEL10:       ; %bb.0: ; %entry
903; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
904; GISEL10-NEXT:    s_mov_b32 s32, 0
905; GISEL10-NEXT:    s_or_saveexec_b32 s9, -1
906; GISEL10-NEXT:    s_or_saveexec_b32 s12, -1
907; GISEL10-NEXT:    s_mov_b32 s6, s0
908; GISEL10-NEXT:    s_mov_b32 s7, s1
909; GISEL10-NEXT:    s_mov_b32 s8, s2
910; GISEL10-NEXT:    s_mov_b32 s10, s3
911; GISEL10-NEXT:    s_mov_b32 s11, s4
912; GISEL10-NEXT:    v_mov_b32_e32 v24, v8
913; GISEL10-NEXT:    v_mov_b32_e32 v25, v9
914; GISEL10-NEXT:    v_mov_b32_e32 v26, v10
915; GISEL10-NEXT:    v_mov_b32_e32 v27, v11
916; GISEL10-NEXT:    v_mov_b32_e32 v28, v12
917; GISEL10-NEXT:    v_mov_b32_e32 v29, v13
918; GISEL10-NEXT:    v_mov_b32_e32 v30, v14
919; GISEL10-NEXT:    v_mov_b32_e32 v31, v15
920; GISEL10-NEXT:    v_mov_b32_e32 v32, v16
921; GISEL10-NEXT:    v_mov_b32_e32 v33, v17
922; GISEL10-NEXT:    v_mov_b32_e32 v34, v18
923; GISEL10-NEXT:    v_mov_b32_e32 v35, v19
924; GISEL10-NEXT:    v_mov_b32_e32 v36, v20
925; GISEL10-NEXT:    v_mov_b32_e32 v37, v21
926; GISEL10-NEXT:    v_mov_b32_e32 v38, v22
927; GISEL10-NEXT:    v_mov_b32_e32 v39, v23
928; GISEL10-NEXT:    s_mov_b32 exec_lo, s12
929; GISEL10-NEXT:    s_and_saveexec_b32 s4, s9
930; GISEL10-NEXT:    s_cbranch_execz .LBB5_2
931; GISEL10-NEXT:  ; %bb.1: ; %shader
932; GISEL10-NEXT:    s_or_saveexec_b32 s9, -1
933; GISEL10-NEXT:    s_getpc_b64 s[0:1]
934; GISEL10-NEXT:    s_add_u32 s0, s0, write_v0_v15@gotpcrel32@lo+4
935; GISEL10-NEXT:    s_addc_u32 s1, s1, write_v0_v15@gotpcrel32@hi+12
936; GISEL10-NEXT:    v_mov_b32_e32 v0, v24
937; GISEL10-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
938; GISEL10-NEXT:    v_mov_b32_e32 v1, v25
939; GISEL10-NEXT:    v_mov_b32_e32 v2, v26
940; GISEL10-NEXT:    v_mov_b32_e32 v3, v27
941; GISEL10-NEXT:    v_mov_b32_e32 v4, v28
942; GISEL10-NEXT:    v_mov_b32_e32 v5, v29
943; GISEL10-NEXT:    v_mov_b32_e32 v6, v30
944; GISEL10-NEXT:    v_mov_b32_e32 v7, v31
945; GISEL10-NEXT:    v_mov_b32_e32 v8, v32
946; GISEL10-NEXT:    v_mov_b32_e32 v9, v33
947; GISEL10-NEXT:    v_mov_b32_e32 v10, v34
948; GISEL10-NEXT:    v_mov_b32_e32 v11, v35
949; GISEL10-NEXT:    v_mov_b32_e32 v12, v36
950; GISEL10-NEXT:    v_mov_b32_e32 v13, v37
951; GISEL10-NEXT:    v_mov_b32_e32 v14, v38
952; GISEL10-NEXT:    v_mov_b32_e32 v15, v39
953; GISEL10-NEXT:    s_mov_b64 s[0:1], s[48:49]
954; GISEL10-NEXT:    s_mov_b64 s[2:3], s[50:51]
955; GISEL10-NEXT:    s_waitcnt lgkmcnt(0)
956; GISEL10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
957; GISEL10-NEXT:    v_mov_b32_e32 v24, v0
958; GISEL10-NEXT:    v_mov_b32_e32 v25, v1
959; GISEL10-NEXT:    v_mov_b32_e32 v26, v2
960; GISEL10-NEXT:    v_mov_b32_e32 v27, v3
961; GISEL10-NEXT:    v_mov_b32_e32 v28, v4
962; GISEL10-NEXT:    v_mov_b32_e32 v29, v5
963; GISEL10-NEXT:    v_mov_b32_e32 v30, v6
964; GISEL10-NEXT:    v_mov_b32_e32 v31, v7
965; GISEL10-NEXT:    v_mov_b32_e32 v32, v8
966; GISEL10-NEXT:    v_mov_b32_e32 v33, v9
967; GISEL10-NEXT:    v_mov_b32_e32 v34, v10
968; GISEL10-NEXT:    v_mov_b32_e32 v35, v11
969; GISEL10-NEXT:    v_mov_b32_e32 v36, v12
970; GISEL10-NEXT:    v_mov_b32_e32 v37, v13
971; GISEL10-NEXT:    v_mov_b32_e32 v38, v14
972; GISEL10-NEXT:    v_mov_b32_e32 v39, v15
973; GISEL10-NEXT:    s_mov_b32 exec_lo, s9
974; GISEL10-NEXT:    ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
975; GISEL10-NEXT:  .LBB5_2: ; %tail
976; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
977; GISEL10-NEXT:    v_mov_b32_e32 v8, v24
978; GISEL10-NEXT:    v_mov_b32_e32 v9, v25
979; GISEL10-NEXT:    v_mov_b32_e32 v10, v26
980; GISEL10-NEXT:    v_mov_b32_e32 v11, v27
981; GISEL10-NEXT:    v_mov_b32_e32 v12, v28
982; GISEL10-NEXT:    v_mov_b32_e32 v13, v29
983; GISEL10-NEXT:    v_mov_b32_e32 v14, v30
984; GISEL10-NEXT:    v_mov_b32_e32 v15, v31
985; GISEL10-NEXT:    v_mov_b32_e32 v16, v32
986; GISEL10-NEXT:    v_mov_b32_e32 v17, v33
987; GISEL10-NEXT:    v_mov_b32_e32 v18, v34
988; GISEL10-NEXT:    v_mov_b32_e32 v19, v35
989; GISEL10-NEXT:    v_mov_b32_e32 v20, v36
990; GISEL10-NEXT:    v_mov_b32_e32 v21, v37
991; GISEL10-NEXT:    v_mov_b32_e32 v22, v38
992; GISEL10-NEXT:    v_mov_b32_e32 v23, v39
993; GISEL10-NEXT:    s_mov_b32 s0, s6
994; GISEL10-NEXT:    s_mov_b32 s1, s7
995; GISEL10-NEXT:    s_mov_b32 s2, s8
996; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
997; GISEL10-NEXT:    s_setpc_b64 s[10:11]
998;
999; DAGISEL10-LABEL: wwm_write_to_arg_reg:
1000; DAGISEL10:       ; %bb.0: ; %entry
1001; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002; DAGISEL10-NEXT:    s_mov_b32 s32, 0
1003; DAGISEL10-NEXT:    s_or_saveexec_b32 s11, -1
1004; DAGISEL10-NEXT:    s_or_saveexec_b32 s6, -1
1005; DAGISEL10-NEXT:    v_mov_b32_e32 v39, v23
1006; DAGISEL10-NEXT:    v_mov_b32_e32 v38, v22
1007; DAGISEL10-NEXT:    v_mov_b32_e32 v37, v21
1008; DAGISEL10-NEXT:    v_mov_b32_e32 v36, v20
1009; DAGISEL10-NEXT:    v_mov_b32_e32 v35, v19
1010; DAGISEL10-NEXT:    v_mov_b32_e32 v34, v18
1011; DAGISEL10-NEXT:    v_mov_b32_e32 v33, v17
1012; DAGISEL10-NEXT:    v_mov_b32_e32 v32, v16
1013; DAGISEL10-NEXT:    v_mov_b32_e32 v31, v15
1014; DAGISEL10-NEXT:    v_mov_b32_e32 v30, v14
1015; DAGISEL10-NEXT:    v_mov_b32_e32 v29, v13
1016; DAGISEL10-NEXT:    v_mov_b32_e32 v28, v12
1017; DAGISEL10-NEXT:    v_mov_b32_e32 v27, v11
1018; DAGISEL10-NEXT:    v_mov_b32_e32 v26, v10
1019; DAGISEL10-NEXT:    v_mov_b32_e32 v25, v9
1020; DAGISEL10-NEXT:    v_mov_b32_e32 v24, v8
1021; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s6
1022; DAGISEL10-NEXT:    s_mov_b32 s9, s4
1023; DAGISEL10-NEXT:    s_mov_b32 s8, s3
1024; DAGISEL10-NEXT:    s_mov_b32 s4, s2
1025; DAGISEL10-NEXT:    s_mov_b32 s6, s1
1026; DAGISEL10-NEXT:    s_mov_b32 s7, s0
1027; DAGISEL10-NEXT:    s_and_saveexec_b32 s10, s11
1028; DAGISEL10-NEXT:    s_cbranch_execz .LBB5_2
1029; DAGISEL10-NEXT:  ; %bb.1: ; %shader
1030; DAGISEL10-NEXT:    s_or_saveexec_b32 s11, -1
1031; DAGISEL10-NEXT:    s_getpc_b64 s[0:1]
1032; DAGISEL10-NEXT:    s_add_u32 s0, s0, write_v0_v15@gotpcrel32@lo+4
1033; DAGISEL10-NEXT:    s_addc_u32 s1, s1, write_v0_v15@gotpcrel32@hi+12
1034; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v24
1035; DAGISEL10-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1036; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v25
1037; DAGISEL10-NEXT:    v_mov_b32_e32 v2, v26
1038; DAGISEL10-NEXT:    v_mov_b32_e32 v3, v27
1039; DAGISEL10-NEXT:    v_mov_b32_e32 v4, v28
1040; DAGISEL10-NEXT:    v_mov_b32_e32 v5, v29
1041; DAGISEL10-NEXT:    v_mov_b32_e32 v6, v30
1042; DAGISEL10-NEXT:    v_mov_b32_e32 v7, v31
1043; DAGISEL10-NEXT:    v_mov_b32_e32 v8, v32
1044; DAGISEL10-NEXT:    v_mov_b32_e32 v9, v33
1045; DAGISEL10-NEXT:    v_mov_b32_e32 v10, v34
1046; DAGISEL10-NEXT:    v_mov_b32_e32 v11, v35
1047; DAGISEL10-NEXT:    v_mov_b32_e32 v12, v36
1048; DAGISEL10-NEXT:    v_mov_b32_e32 v13, v37
1049; DAGISEL10-NEXT:    v_mov_b32_e32 v14, v38
1050; DAGISEL10-NEXT:    v_mov_b32_e32 v15, v39
1051; DAGISEL10-NEXT:    s_mov_b64 s[0:1], s[48:49]
1052; DAGISEL10-NEXT:    s_mov_b64 s[2:3], s[50:51]
1053; DAGISEL10-NEXT:    s_waitcnt lgkmcnt(0)
1054; DAGISEL10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
1055; DAGISEL10-NEXT:    v_mov_b32_e32 v40, v0
1056; DAGISEL10-NEXT:    v_mov_b32_e32 v41, v1
1057; DAGISEL10-NEXT:    v_mov_b32_e32 v42, v2
1058; DAGISEL10-NEXT:    v_mov_b32_e32 v43, v3
1059; DAGISEL10-NEXT:    v_mov_b32_e32 v44, v4
1060; DAGISEL10-NEXT:    v_mov_b32_e32 v45, v5
1061; DAGISEL10-NEXT:    v_mov_b32_e32 v46, v6
1062; DAGISEL10-NEXT:    v_mov_b32_e32 v47, v7
1063; DAGISEL10-NEXT:    v_mov_b32_e32 v48, v8
1064; DAGISEL10-NEXT:    v_mov_b32_e32 v49, v9
1065; DAGISEL10-NEXT:    v_mov_b32_e32 v50, v10
1066; DAGISEL10-NEXT:    v_mov_b32_e32 v51, v11
1067; DAGISEL10-NEXT:    v_mov_b32_e32 v52, v12
1068; DAGISEL10-NEXT:    v_mov_b32_e32 v53, v13
1069; DAGISEL10-NEXT:    v_mov_b32_e32 v54, v14
1070; DAGISEL10-NEXT:    v_mov_b32_e32 v55, v15
1071; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s11
1072; DAGISEL10-NEXT:    v_mov_b32_e32 v24, v40
1073; DAGISEL10-NEXT:    v_mov_b32_e32 v25, v41
1074; DAGISEL10-NEXT:    v_mov_b32_e32 v26, v42
1075; DAGISEL10-NEXT:    v_mov_b32_e32 v27, v43
1076; DAGISEL10-NEXT:    v_mov_b32_e32 v28, v44
1077; DAGISEL10-NEXT:    v_mov_b32_e32 v29, v45
1078; DAGISEL10-NEXT:    v_mov_b32_e32 v30, v46
1079; DAGISEL10-NEXT:    v_mov_b32_e32 v31, v47
1080; DAGISEL10-NEXT:    v_mov_b32_e32 v32, v48
1081; DAGISEL10-NEXT:    v_mov_b32_e32 v33, v49
1082; DAGISEL10-NEXT:    v_mov_b32_e32 v34, v50
1083; DAGISEL10-NEXT:    v_mov_b32_e32 v35, v51
1084; DAGISEL10-NEXT:    v_mov_b32_e32 v36, v52
1085; DAGISEL10-NEXT:    v_mov_b32_e32 v37, v53
1086; DAGISEL10-NEXT:    v_mov_b32_e32 v38, v54
1087; DAGISEL10-NEXT:    v_mov_b32_e32 v39, v55
1088; DAGISEL10-NEXT:  .LBB5_2: ; %tail
1089; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s10
1090; DAGISEL10-NEXT:    v_mov_b32_e32 v8, v24
1091; DAGISEL10-NEXT:    v_mov_b32_e32 v9, v25
1092; DAGISEL10-NEXT:    v_mov_b32_e32 v10, v26
1093; DAGISEL10-NEXT:    v_mov_b32_e32 v11, v27
1094; DAGISEL10-NEXT:    v_mov_b32_e32 v12, v28
1095; DAGISEL10-NEXT:    v_mov_b32_e32 v13, v29
1096; DAGISEL10-NEXT:    v_mov_b32_e32 v14, v30
1097; DAGISEL10-NEXT:    v_mov_b32_e32 v15, v31
1098; DAGISEL10-NEXT:    v_mov_b32_e32 v16, v32
1099; DAGISEL10-NEXT:    v_mov_b32_e32 v17, v33
1100; DAGISEL10-NEXT:    v_mov_b32_e32 v18, v34
1101; DAGISEL10-NEXT:    v_mov_b32_e32 v19, v35
1102; DAGISEL10-NEXT:    v_mov_b32_e32 v20, v36
1103; DAGISEL10-NEXT:    v_mov_b32_e32 v21, v37
1104; DAGISEL10-NEXT:    v_mov_b32_e32 v22, v38
1105; DAGISEL10-NEXT:    v_mov_b32_e32 v23, v39
1106; DAGISEL10-NEXT:    s_mov_b32 s0, s7
1107; DAGISEL10-NEXT:    s_mov_b32 s1, s6
1108; DAGISEL10-NEXT:    s_mov_b32 s2, s4
1109; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
1110; DAGISEL10-NEXT:    s_setpc_b64 s[8:9]
1111entry:
  ; The i1 returned by llvm.amdgcn.init.whole.wave selects between %shader and %tail.
1112  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
1113  br i1 %entry_exec, label %shader, label %tail
1114
1115shader:
  ; Pass the incoming <16 x i32> to the amdgpu_gfx callee and route its result
  ; through llvm.amdgcn.strict.wwm before it reaches the phi in %tail.
1116  %v0.15 = call amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32> %vgpr)
1117  %vgpr.wwm = call <16 x i32> @llvm.amdgcn.strict.wwm.v16i32(<16 x i32> %v0.15)
1118
1119  br label %tail
1120
1121tail:
  ; Forward the untouched %vgpr when arriving from %entry, or the strict.wwm
  ; result when arriving from %shader, as the VGPR arguments of the chain call.
1122  %vgpr.args = phi <16 x i32> [%vgpr, %entry], [%vgpr.wwm, %shader]
1123  call void(ptr, i32, <3 x i32>, <16 x i32>, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, <16 x i32> %vgpr.args, i32 0)
1124  unreachable
1125}
1126
; Callee invoked from the %shader block of @wwm_write_to_arg_reg. It uses the
; amdgpu_gfx calling convention and both takes and returns a <16 x i32>.
1127declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)
1128