xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (revision 54d31bde324523d946fd87f5c5d5e271826209d6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
3
4; This file contains various tests that have divergent i1s used outside of
5; the loop. These are lane masks in sgprs and need to have the correct value in
6; the corresponding bit at the iteration in which the lane exits the loop.
7; This is achieved by merging the lane mask with the same lane mask from the
8; previous iteration and using that merged lane mask outside of the loop.
9
10; Phi used outside of the loop directly (loopfinder will figure out that it
11; needs to merge lane mask across all iterations)
; Single-block loop whose i1 header phi is read after the loop.
; %bool.counter starts as (%pre.cond.val > 1.0) and is inverted on every
; iteration; the loop branches to %exit once uitofp(%counter) > %val. The
; select in %exit reads the phi %bool.counter directly and stores the chosen
; float through %addr.
12define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
13; GFX10-LABEL: divergent_i1_phi_used_outside_loop:
14; GFX10:       ; %bb.0: ; %entry
15; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
17; GFX10-NEXT:    s_mov_b32 s4, 0
18; GFX10-NEXT:    v_mov_b32_e32 v1, s4
19; GFX10-NEXT:    s_andn2_b32 s5, s4, exec_lo
20; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
21; GFX10-NEXT:    s_or_b32 s6, s5, s6
22; GFX10-NEXT:    ; implicit-def: $sgpr5
23; GFX10-NEXT:  .LBB0_1: ; %loop
24; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
25; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
26; GFX10-NEXT:    s_xor_b32 s7, s6, -1
27; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
28; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
29; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
30; GFX10-NEXT:    s_andn2_b32 s8, s6, exec_lo
31; GFX10-NEXT:    s_and_b32 s7, exec_lo, s7
32; GFX10-NEXT:    s_andn2_b32 s5, s5, exec_lo
33; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
34; GFX10-NEXT:    s_or_b32 s7, s8, s7
35; GFX10-NEXT:    s_or_b32 s5, s5, s6
36; GFX10-NEXT:    s_mov_b32 s6, s7
37; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
38; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
39; GFX10-NEXT:  ; %bb.2: ; %exit
40; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
41; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
42; GFX10-NEXT:    flat_store_dword v[2:3], v0
43; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-NEXT:    s_setpc_b64 s[30:31]
45entry:
46  %pre.cond = fcmp ogt float %pre.cond.val, 1.0
47  br label %loop
48
49loop:
50  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
  ; i1 phi that is carried around the loop and also read in %exit.
51  %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
52  %neg.bool.counter = xor i1 %bool.counter, true
53  %f.counter = uitofp i32 %counter to float
  ; Loop exit condition: depends on the argument %val.
54  %cond = fcmp ogt float %f.counter, %val
55  %counter.plus.1 = add i32 %counter, 1
56  br i1 %cond, label %exit, label %loop
57
58exit:
  ; Use of the loop-header phi outside of the loop.
59  %select = select i1 %bool.counter, float 1.000000e+00, float 0.000000e+00
60  store float %select, ptr %addr
61  ret void
62}
63
; Multi-block loop whose i1 header phi is read after the loop.
; %all.eq.zero starts as true. While it is true, %is.eq.zero replaces it with
; (a[i] == 0); once it is false the load is skipped and the value is passed
; through unchanged by the %eq.zero phi. %loop.cond branches to %exit when
; %i < 10, and the select in %exit reads the header phi %all.eq.zero.
64define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr addrspace(1) %a, ptr %addr) {
65; GFX10-LABEL: divergent_i1_phi_used_outside_loop_larger_loop_body:
66; GFX10:       ; %bb.0: ; %entry
67; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68; GFX10-NEXT:    s_mov_b32 s4, -1
69; GFX10-NEXT:    ; implicit-def: $sgpr6
70; GFX10-NEXT:    v_mov_b32_e32 v0, s4
71; GFX10-NEXT:    s_andn2_b32 s5, s4, exec_lo
72; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
73; GFX10-NEXT:    s_or_b32 s4, s5, s4
74; GFX10-NEXT:    s_branch .LBB1_2
75; GFX10-NEXT:  .LBB1_1: ; %loop.cond
76; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
77; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
78; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, 4
79; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
80; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
81; GFX10-NEXT:    s_andn2_b32 s7, s5, exec_lo
82; GFX10-NEXT:    s_and_b32 s8, exec_lo, s6
83; GFX10-NEXT:    v_cmp_le_i32_e32 vcc_lo, 10, v0
84; GFX10-NEXT:    s_or_b32 s4, s7, s8
85; GFX10-NEXT:    s_cbranch_vccz .LBB1_4
86; GFX10-NEXT:  .LBB1_2: ; %loop.start
87; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
88; GFX10-NEXT:    s_mov_b32 s5, s4
89; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
90; GFX10-NEXT:    s_and_b32 s6, exec_lo, s5
91; GFX10-NEXT:    s_or_b32 s6, s4, s6
92; GFX10-NEXT:    s_and_saveexec_b32 s4, s5
93; GFX10-NEXT:    s_cbranch_execz .LBB1_1
94; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
95; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
96; GFX10-NEXT:    global_load_dword v5, v[1:2], off
97; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
98; GFX10-NEXT:    s_waitcnt vmcnt(0)
99; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
100; GFX10-NEXT:    s_and_b32 s7, exec_lo, vcc_lo
101; GFX10-NEXT:    s_or_b32 s6, s6, s7
102; GFX10-NEXT:    s_branch .LBB1_1
103; GFX10-NEXT:  .LBB1_4: ; %exit
104; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
105; GFX10-NEXT:    flat_store_dword v[3:4], v0
106; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX10-NEXT:    s_setpc_b64 s[30:31]
108entry:
109  br label %loop.start
110
111loop.start:
112  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
  ; i1 phi that is carried around the loop and also read in %exit.
113  %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero, %loop.cond ]
114  br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond
115
116is.eq.zero:
  ; Only reached while %all.eq.zero is still true: re-evaluate it from a[i].
117  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
118  %elt.i = load i32, ptr addrspace(1) %a.plus.i
119  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
120  br label %loop.cond
121
122loop.cond:
  ; Merge of the pass-through value and the freshly computed compare.
123  %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
124  %cond = icmp slt i32 %i, 10
125  %i.plus.1 = add i32 %i, 1
126  br i1 %cond, label %exit, label %loop.start
127
128exit:
  ; Use of the loop-header phi outside of the loop.
129  %select = select i1 %all.eq.zero, float 1.000000e+00, float 0.000000e+00
130  store float %select, ptr %addr
131  ret void
132}
133
134; Non-phi used outside of the loop
135
; Single-block loop where a non-phi i1 computed in the loop is read after it.
; %bool.counter starts as (%pre.cond.val > 1.0) and is inverted on every
; iteration by the xor that defines %neg.bool.counter; the loop branches to
; %exit once uitofp(%counter) > %val. The select in %exit reads the xor result
; %neg.bool.counter (not the phi) and stores the chosen float through %addr.
136define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
137; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
138; GFX10:       ; %bb.0: ; %entry
139; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX10-NEXT:    s_mov_b32 s4, 0
141; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, 1.0, v1
142; GFX10-NEXT:    v_mov_b32_e32 v1, s4
143; GFX10-NEXT:    ; implicit-def: $sgpr6
144; GFX10-NEXT:  .LBB2_1: ; %loop
145; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
146; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
147; GFX10-NEXT:    s_xor_b32 s5, s5, -1
148; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
149; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
150; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
151; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
152; GFX10-NEXT:    s_and_b32 s7, exec_lo, s5
153; GFX10-NEXT:    s_or_b32 s6, s6, s7
154; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
155; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
156; GFX10-NEXT:  ; %bb.2: ; %exit
157; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
158; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
159; GFX10-NEXT:    flat_store_dword v[2:3], v0
160; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
161; GFX10-NEXT:    s_setpc_b64 s[30:31]
162entry:
163  %pre.cond = fcmp ogt float %pre.cond.val, 1.0
164  br label %loop
165
166loop:
167  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
168  %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
  ; i1 defined inside the loop that feeds both the phi above and %exit.
169  %neg.bool.counter = xor i1 %bool.counter, true
170  %f.counter = uitofp i32 %counter to float
  ; Loop exit condition: depends on the argument %val.
171  %cond = fcmp ogt float %f.counter, %val
172  %counter.plus.1 = add i32 %counter, 1
173  br i1 %cond, label %exit, label %loop
174
175exit:
  ; Use of the in-loop xor result outside of the loop.
176  %select = select i1 %neg.bool.counter, float 1.000000e+00, float 0.000000e+00
177  store float %select, ptr %addr
178  ret void
179}
180
181;void xor(int num_elts, int* a, int* addr) {
182;for(int i=0; i<num_elts; ++i) {
183;  if(a[i]==0)
184;    return;
185;}
186;addr[0] = 5
187;return;
188;}
189
; Loop with two exits that lead to different blocks.
; %entry enters the loop only when %num.elts == 0, otherwise it goes straight
; to %block.after.loop. Inside the loop, %loop.start loads a[i] and leaves
; directly to %exit (skipping the store) when the element is zero; %loop.cond
; leaves to %block.after.loop when %i < %num.elts and otherwise continues with
; i + 1. %block.after.loop stores 5 through %addr before reaching %exit.
190define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ptr addrspace(1) %a, ptr %addr) {
191; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
192; GFX10:       ; %bb.0: ; %entry
193; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX10-NEXT:    s_mov_b32 s5, 0
195; GFX10-NEXT:    s_mov_b32 s6, -1
196; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
197; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
198; GFX10-NEXT:    s_cbranch_execz .LBB3_6
199; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
200; GFX10-NEXT:    v_mov_b32_e32 v5, s5
201; GFX10-NEXT:    ; implicit-def: $sgpr6
202; GFX10-NEXT:    ; implicit-def: $sgpr7
203; GFX10-NEXT:    ; implicit-def: $sgpr8
204; GFX10-NEXT:    s_branch .LBB3_3
205; GFX10-NEXT:  .LBB3_2: ; %Flow
206; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
207; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s9
208; GFX10-NEXT:    s_xor_b32 s9, s8, -1
209; GFX10-NEXT:    s_and_b32 s10, exec_lo, s7
210; GFX10-NEXT:    s_or_b32 s5, s10, s5
211; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
212; GFX10-NEXT:    s_and_b32 s9, exec_lo, s9
213; GFX10-NEXT:    s_or_b32 s6, s6, s9
214; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
215; GFX10-NEXT:    s_cbranch_execz .LBB3_5
216; GFX10-NEXT:  .LBB3_3: ; %loop.start
217; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
218; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
219; GFX10-NEXT:    s_andn2_b32 s8, s8, exec_lo
220; GFX10-NEXT:    s_and_b32 s9, exec_lo, -1
221; GFX10-NEXT:    s_andn2_b32 s7, s7, exec_lo
222; GFX10-NEXT:    s_or_b32 s8, s8, s9
223; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
224; GFX10-NEXT:    s_or_b32 s7, s7, s9
225; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
226; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
227; GFX10-NEXT:    global_load_dword v6, v[6:7], off
228; GFX10-NEXT:    s_waitcnt vmcnt(0)
229; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
230; GFX10-NEXT:    s_and_saveexec_b32 s9, vcc_lo
231; GFX10-NEXT:    s_cbranch_execz .LBB3_2
232; GFX10-NEXT:  ; %bb.4: ; %loop.cond
233; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
234; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v5
235; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
236; GFX10-NEXT:    s_andn2_b32 s8, s8, exec_lo
237; GFX10-NEXT:    s_and_b32 s10, exec_lo, 0
238; GFX10-NEXT:    s_andn2_b32 s7, s7, exec_lo
239; GFX10-NEXT:    v_mov_b32_e32 v5, v6
240; GFX10-NEXT:    s_and_b32 s11, exec_lo, vcc_lo
241; GFX10-NEXT:    s_or_b32 s8, s8, s10
242; GFX10-NEXT:    s_or_b32 s7, s7, s11
243; GFX10-NEXT:    s_branch .LBB3_2
244; GFX10-NEXT:  .LBB3_5: ; %loop.exit.guard
245; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
246; GFX10-NEXT:    s_andn2_b32 s5, -1, exec_lo
247; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
248; GFX10-NEXT:    s_or_b32 s6, s5, s6
249; GFX10-NEXT:  .LBB3_6: ; %Flow1
250; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
251; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
252; GFX10-NEXT:    s_cbranch_execz .LBB3_8
253; GFX10-NEXT:  ; %bb.7: ; %block.after.loop
254; GFX10-NEXT:    v_mov_b32_e32 v0, 5
255; GFX10-NEXT:    flat_store_dword v[3:4], v0
256; GFX10-NEXT:  .LBB3_8: ; %exit
257; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
258; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
259; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX10-NEXT:    s_setpc_b64 s[30:31]
261entry:
  ; The loop is entered only on the "%num.elts == 0" edge.
262  %start.cond = icmp eq i32 %num.elts, 0
263  br i1 %start.cond, label %loop.start, label %block.after.loop
264
265loop.start:
266  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
267  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
268  %elt.i = load i32, ptr addrspace(1) %a.plus.i
269  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
  ; First loop exit: goes to %exit and bypasses the store.
270  br i1 %elt.i.eq.zero, label %exit, label %loop.cond
271
272loop.cond:
273  %cond = icmp slt i32 %i, %num.elts
274  %i.plus.1 = add i32 %i, 1
  ; Second loop exit: goes to %block.after.loop, which performs the store.
275  br i1 %cond, label %block.after.loop, label %loop.start
276
277block.after.loop:
278  store i32 5, ptr %addr
279  br label %exit
280
281exit:
282  ret void
283}
284
285
286;void icmp(int v0, int v1, int* a, int* b, int* c, int* addr) {
287;int i=0; for(;;++i) {
288;  if(v0==i) a[i] = i;
289;  if(v1==i) break;
290;}
291;if(v0==i) c[0] = i;
292;return;
293;}
294
; Loop with a single break whose compare is evaluated again after the loop.
; Each iteration stores %i to a[i] when %v0 == %i (%cond.0), then leaves the
; loop to %cond.block.1 when %v1 == %i (%cond.1); otherwise %loop.cond
; increments %i and loops unconditionally. After the loop, %cond.2 compares
; %v0 with %i again and, when true, %i is stored through %c.
; In the expected output the post-loop branch tests s6, which is built inside
; the loop from the in-loop v_cmp_eq result, rather than a new compare.
; The arguments %b and %addr are not referenced by the body.
295define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr %addr) {
296; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
297; GFX10:       ; %bb.0: ; %entry
298; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299; GFX10-NEXT:    s_mov_b32 s5, 0
300; GFX10-NEXT:    ; implicit-def: $sgpr6
301; GFX10-NEXT:    v_mov_b32_e32 v4, s5
302; GFX10-NEXT:    s_branch .LBB4_2
303; GFX10-NEXT:  .LBB4_1: ; %Flow
304; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
305; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
306; GFX10-NEXT:    s_and_b32 s4, exec_lo, s7
307; GFX10-NEXT:    s_or_b32 s5, s4, s5
308; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
309; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
310; GFX10-NEXT:    s_or_b32 s6, s4, s6
311; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
312; GFX10-NEXT:    s_cbranch_execz .LBB4_6
313; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
314; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
315; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
316; GFX10-NEXT:    s_and_saveexec_b32 s7, vcc_lo
317; GFX10-NEXT:    s_cbranch_execz .LBB4_4
318; GFX10-NEXT:  ; %bb.3: ; %if.block.0
319; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
320; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
321; GFX10-NEXT:    v_lshlrev_b64 v[8:9], 2, v[4:5]
322; GFX10-NEXT:    v_add_co_u32 v8, s4, v2, v8
323; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
324; GFX10-NEXT:    global_store_dword v[8:9], v4, off
325; GFX10-NEXT:  .LBB4_4: ; %loop.break.block
326; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
327; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
328; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
329; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
330; GFX10-NEXT:    s_mov_b32 s7, -1
331; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
332; GFX10-NEXT:    s_cbranch_execz .LBB4_1
333; GFX10-NEXT:  ; %bb.5: ; %loop.cond
334; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
335; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
336; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
337; GFX10-NEXT:    s_and_b32 s7, exec_lo, 0
338; GFX10-NEXT:    s_or_b32 s7, s4, s7
339; GFX10-NEXT:    s_branch .LBB4_1
340; GFX10-NEXT:  .LBB4_6: ; %cond.block.1
341; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
342; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
343; GFX10-NEXT:    s_cbranch_execz .LBB4_8
344; GFX10-NEXT:  ; %bb.7: ; %if.block.1
345; GFX10-NEXT:    global_store_dword v[6:7], v4, off
346; GFX10-NEXT:  .LBB4_8: ; %exit
347; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
348; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
349; GFX10-NEXT:    s_setpc_b64 s[30:31]
350entry:
351  br label %loop.start
352
353loop.start:
354  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
355  br label %cond.block.0
356
357cond.block.0:
  ; In-loop compare of %v0 against the induction variable.
358  %cond.0 = icmp eq i32 %v0, %i
359  br i1 %cond.0, label %if.block.0, label %loop.break.block
360
361if.block.0:
362  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
363  store i32 %i, ptr addrspace(1) %a.plus.i
364  br label %loop.break.block
365
366loop.break.block:
  ; The only loop exit: taken when %v1 == %i.
367  %cond.1 = icmp eq i32 %v1, %i
368  br i1 %cond.1, label %cond.block.1, label %loop.cond
369
370loop.cond:
371  ; no cond, infinite loop with one break
372  %i.plus.1 = add i32 %i, 1
373  br label %loop.start
374
375cond.block.1:
  ; Same operands as %cond.0, but evaluated after the loop on the final %i.
376  %cond.2 = icmp eq i32 %v0, %i
377  br i1 %cond.2, label %if.block.1, label %exit
378
379if.block.1:
380  store i32 %i, ptr addrspace(1) %c
381  br label %exit
382
383exit:
384  ret void
385}
386
387
388; bool all_eq_zero = true;
389; i32 i = 0;
390; do {
391;   if(all_eq_zero)
392;     all_eq_zero = (a[i] == 0);
393;
394;   i += 1;
395; } while ( i < n )
396
397; *addr = all_eq_zero ? 1.0 : 0.0;
398
399; check that all elements in an array of size n are zero, loop has divergent
400; exit condition based on array size, but zero check does not break out of the
401; loop but instead skips zero check in remaining iterations
402; llpc "freezes" zero check since it is (via phi) used in a conditional branch
; Multi-block loop where a frozen i1 is both loop-carried and read after the
; loop. %all.eq.zero starts as true; while it is true, %is.eq.zero replaces it
; with (a[i] == 0), otherwise the value is passed through by the %eq.zero phi.
; %eq.zero.fr = freeze(%eq.zero) feeds the header phi and the select in %exit.
; %loop.cond branches to %exit when %i < %n.
403define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspace(1) %a, ptr %addr) {
404; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
405; GFX10:       ; %bb.0: ; %entry
406; GFX10-NEXT:    s_mov_b32 s0, 0
407; GFX10-NEXT:    s_mov_b32 s3, -1
408; GFX10-NEXT:    v_mov_b32_e32 v5, s0
409; GFX10-NEXT:    ; implicit-def: $sgpr1
410; GFX10-NEXT:    ; implicit-def: $sgpr2
411; GFX10-NEXT:    s_branch .LBB5_2
412; GFX10-NEXT:  .LBB5_1: ; %loop.cond
413; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
414; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
415; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
416; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v5
417; GFX10-NEXT:    s_or_b32 s0, vcc_lo, s0
418; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
419; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
420; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
421; GFX10-NEXT:    s_or_b32 s3, s3, s4
422; GFX10-NEXT:    s_or_b32 s1, s1, s4
423; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
424; GFX10-NEXT:    s_cbranch_execz .LBB5_4
425; GFX10-NEXT:  .LBB5_2: ; %loop.start
426; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
427; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
428; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
429; GFX10-NEXT:    s_or_b32 s2, s2, s4
430; GFX10-NEXT:    s_and_saveexec_b32 s4, s3
431; GFX10-NEXT:    s_cbranch_execz .LBB5_1
432; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
433; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
434; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
435; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
436; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
437; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
438; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
439; GFX10-NEXT:    global_load_dword v6, v[6:7], off
440; GFX10-NEXT:    s_waitcnt vmcnt(0)
441; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
442; GFX10-NEXT:    s_and_b32 s3, exec_lo, vcc_lo
443; GFX10-NEXT:    s_or_b32 s2, s2, s3
444; GFX10-NEXT:    ; implicit-def: $sgpr3
445; GFX10-NEXT:    s_branch .LBB5_1
446; GFX10-NEXT:  .LBB5_4: ; %exit
447; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
448; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s1
449; GFX10-NEXT:    flat_store_dword v[3:4], v0
450; GFX10-NEXT:    s_endpgm
451entry:
452  br label %loop.start
453
454loop.start:
455  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
  ; Loop-carried copy of the frozen value; also guards the zero check below.
456  %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero.fr, %loop.cond ]
457  br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond
458
459is.eq.zero:
  ; Only reached while %all.eq.zero is still true: re-evaluate it from a[i].
460  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
461  %elt.i = load i32, ptr addrspace(1) %a.plus.i
462  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
463  br label %loop.cond
464
465loop.cond:
466  %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
  ; Frozen i1 that is used by the header phi and by the select in %exit.
467  %eq.zero.fr = freeze i1 %eq.zero
468  %cond = icmp slt i32 %i, %n
469  %i.plus.1 = add i32 %i, 1
470  br i1 %cond, label %exit, label %loop.start
471
472exit:
  ; Use of the frozen in-loop value outside of the loop.
473  %select = select i1 %eq.zero.fr, float 1.000000e+00, float 0.000000e+00
474  store float %select, ptr %addr
475  ret void
476}
477
478; Divergent i1 phi from structurize-cfg used outside of the loop
; Loop with one break that runs extra code before reaching %exit.
; %A loads a[counter] and leaves the loop to %break.body when the value is
; zero; %break.body stores 10 through %a.break. Otherwise %loop.body
; increments x[counter] in memory, computes counter + 1, and branches to
; %exit when %counter is unsigned-less-than 100, else back to %A.
; The input IR has no i1 phi; in the expected output the branch to
; %break.body is taken from s1 in the %loop.exit.guard block.
479define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
480; GFX10-LABEL: loop_with_1break:
481; GFX10:       ; %bb.0: ; %entry
482; GFX10-NEXT:    s_mov_b32 s0, 0
483; GFX10-NEXT:    ; implicit-def: $sgpr1
484; GFX10-NEXT:    ; implicit-def: $sgpr2
485; GFX10-NEXT:    ; implicit-def: $sgpr3
486; GFX10-NEXT:    v_mov_b32_e32 v6, s0
487; GFX10-NEXT:    s_branch .LBB6_2
488; GFX10-NEXT:  .LBB6_1: ; %Flow
489; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
490; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
491; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
492; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
493; GFX10-NEXT:    s_or_b32 s0, s4, s0
494; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
495; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
496; GFX10-NEXT:    s_or_b32 s1, s1, s4
497; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
498; GFX10-NEXT:    s_cbranch_execz .LBB6_4
499; GFX10-NEXT:  .LBB6_2: ; %A
500; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
501; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
502; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
503; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
504; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
505; GFX10-NEXT:    s_or_b32 s3, s3, s4
506; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
507; GFX10-NEXT:    s_or_b32 s2, s2, s4
508; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
509; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
510; GFX10-NEXT:    global_load_dword v9, v[9:10], off
511; GFX10-NEXT:    s_waitcnt vmcnt(0)
512; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
513; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
514; GFX10-NEXT:    s_cbranch_execz .LBB6_1
515; GFX10-NEXT:  ; %bb.3: ; %loop.body
516; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
517; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
518; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
519; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
520; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
521; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
522; GFX10-NEXT:    global_load_dword v9, v[7:8], off
523; GFX10-NEXT:    s_and_b32 s5, exec_lo, 0
524; GFX10-NEXT:    v_mov_b32_e32 v6, v10
525; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
526; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
527; GFX10-NEXT:    s_or_b32 s3, s3, s5
528; GFX10-NEXT:    s_or_b32 s2, s2, s6
529; GFX10-NEXT:    s_waitcnt vmcnt(0)
530; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
531; GFX10-NEXT:    global_store_dword v[7:8], v9, off
532; GFX10-NEXT:    s_branch .LBB6_1
533; GFX10-NEXT:  .LBB6_4: ; %loop.exit.guard
534; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
535; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
536; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
537; GFX10-NEXT:    s_cbranch_execz .LBB6_6
538; GFX10-NEXT:  ; %bb.5: ; %break.body
539; GFX10-NEXT:    v_mov_b32_e32 v0, 10
540; GFX10-NEXT:    global_store_dword v[4:5], v0, off
541; GFX10-NEXT:  .LBB6_6: ; %exit
542; GFX10-NEXT:    s_endpgm
543entry:
544  br label %A
545
546A:
547  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
548  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
549  %a.val = load i32, ptr addrspace(1) %a.plus.counter
550  %a.cond = icmp eq i32 %a.val, 0
  ; Break edge: leaves the loop through %break.body.
551  br i1 %a.cond, label %break.body, label %loop.body
552
553break.body:
554  store i32 10, ptr addrspace(1) %a.break
555  br label %exit
556
557loop.body:
  ; x[counter] += 1
558  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
559  %x.val = load i32, ptr addrspace(1) %x.plus.counter
560  %x.val.plus.1 = add i32 %x.val, 1
561  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
562  %counter.plus.1 = add i32 %counter, 1
563  %x.cond = icmp ult i32 %counter, 100
  ; Regular loop exit: goes straight to %exit, bypassing %break.body.
564  br i1 %x.cond, label %exit, label %A
565
566exit:
567  ret void
568}
569
570