xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll (revision 6c2eec5ceadf26ce8d732d718a8906d075a7d6c7)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
3
4; Simplest case, if - then, that requires lane mask merging:
5; the %phi lane mask will hold %val_A at %A. Lanes that are active in %B
6; will overwrite their own lane bit in the lane mask with %val_B.
7define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
8; GFX10-LABEL: divergent_i1_phi_if_then:
9; GFX10:       ; %bb.0: ; %A
10; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
11; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
12; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
13; GFX10-NEXT:  ; %bb.1: ; %B
14; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 1, v2
15; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
16; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
17; GFX10-NEXT:    s_or_b32 s0, s0, s2
18; GFX10-NEXT:  ; %bb.2: ; %exit
19; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
20; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
21; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
22; GFX10-NEXT:    global_store_dword v[0:1], v2, off
23; GFX10-NEXT:    s_endpgm
24A:
25  %val_A = icmp uge i32 %tid, 6  ; i1 incoming to %phi on the edge from %A
26  %cmp = icmp eq i32 %cond, 0
27  br i1 %cmp, label %B, label %exit  ; %B is entered only when %cond == 0
28
29B:
30  %val_B = icmp ult i32 %tid, 1  ; i1 incoming to %phi on the edge from %B
31  br label %exit
32
33exit:
34  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]  ; the i1 phi whose lane mask is being merged
35  %sel = select i1 %phi, i32 1, i32 2  ; make the i1 observable: 1 if true, 2 if false
36  store i32 %sel, ptr addrspace(1) %out
37  ret void
38}
39
40; if - else: %A and %B are both predecessors of %exit and each supplies its own i1 value to %phi.
41define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
42; GFX10-LABEL: divergent_i1_phi_if_else:
43; GFX10:       ; %bb.0: ; %entry
44; GFX10-NEXT:    s_and_b32 s0, 1, s0
45; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
46; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
47; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
48; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
49; GFX10-NEXT:  ; %bb.1: ; %B
50; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
51; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
52; GFX10-NEXT:    ; implicit-def: $vgpr2
53; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
54; GFX10-NEXT:    s_or_b32 s0, s0, s2
55; GFX10-NEXT:  ; %bb.2: ; %Flow
56; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
57; GFX10-NEXT:  ; %bb.3: ; %A
58; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
59; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
60; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
61; GFX10-NEXT:    s_or_b32 s0, s0, s2
62; GFX10-NEXT:  ; %bb.4: ; %exit
63; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
64; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
65; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
66; GFX10-NEXT:    global_store_dword v[0:1], v2, off
67; GFX10-NEXT:    s_endpgm
68entry:
69  %cmp = icmp eq i32 %cond, 0
70  br i1 %cmp, label %A, label %B  ; %A when %cond == 0, otherwise %B
71
72A:
73  %val_A = icmp uge i32 %tid, 1  ; i1 incoming to %phi on the edge from %A
74  br label %exit
75
76B:
77  %val_B = icmp ult i32 %tid, 2  ; i1 incoming to %phi on the edge from %B
78  br label %exit
79
80exit:
81  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]  ; the i1 phi whose lane mask is being merged
82  %sel = select i1 %phi, i32 1, i32 2  ; make the i1 observable: 1 if true, 2 if false
83  store i32 %sel, ptr addrspace(1) %out
84  ret void
85}
86
87; if - break;
88
89;  counter = 0;
90;  do {
91;    if (a[counter] == 0)
92;      break;
93;    if (b[counter] == 0)
94;      break;
95;    if (c[counter] == 0)
96;      break;
97;    x[counter++]+=1;
98;  } while (counter<100);
99
100; Tests with multiple break conditions. Divergent phis will be used to track
101; if any of the break conditions was reached. We only need to do simple lane
102; mask merging (for current loop iteration only). There is an intrinsic,
103; if_break, that will merge lane masks across all iterations of the loop.
104
; Loop with one early exit: %A leaves the loop when a[counter] == 0; otherwise
; %loop.body increments x[counter], bumps the counter and decides whether to
; take the back edge to %A.
105define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
106; GFX10-LABEL: loop_with_1break:
107; GFX10:       ; %bb.0: ; %entry
108; GFX10-NEXT:    s_mov_b32 s0, 0
109; GFX10-NEXT:    ; implicit-def: $sgpr1
110; GFX10-NEXT:    v_mov_b32_e32 v4, s0
111; GFX10-NEXT:    s_branch .LBB2_2
112; GFX10-NEXT:  .LBB2_1: ; %Flow
113; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
114; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
115; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
116; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
117; GFX10-NEXT:    s_or_b32 s0, s2, s0
118; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
119; GFX10-NEXT:    s_cbranch_execz .LBB2_4
120; GFX10-NEXT:  .LBB2_2: ; %A
121; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
122; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
123; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
124; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
125; GFX10-NEXT:    s_or_b32 s1, s1, s2
126; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 2, v[4:5]
127; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v2, v5
128; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
129; GFX10-NEXT:    global_load_dword v7, v[7:8], off
130; GFX10-NEXT:    s_waitcnt vmcnt(0)
131; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
132; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
133; GFX10-NEXT:    s_cbranch_execz .LBB2_1
134; GFX10-NEXT:  ; %bb.3: ; %loop.body
135; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
136; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v0, v5
137; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
138; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v4
139; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v4
140; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
141; GFX10-NEXT:    global_load_dword v7, v[5:6], off
142; GFX10-NEXT:    v_mov_b32_e32 v4, v8
143; GFX10-NEXT:    s_and_b32 s3, exec_lo, vcc_lo
144; GFX10-NEXT:    s_or_b32 s1, s1, s3
145; GFX10-NEXT:    s_waitcnt vmcnt(0)
146; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v7
147; GFX10-NEXT:    global_store_dword v[5:6], v7, off
148; GFX10-NEXT:    s_branch .LBB2_1
149; GFX10-NEXT:  .LBB2_4: ; %exit
150; GFX10-NEXT:    s_endpgm
151entry:
152  br label %A
153
154A:
155  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]  ; 0 on entry, %counter.plus.1 on the back edge
156  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter  ; &a[counter]
157  %a.val = load i32, ptr addrspace(1) %a.plus.counter
158  %a.cond = icmp eq i32 %a.val, 0
159  br i1 %a.cond, label %exit, label %loop.body  ; break: leave the loop when a[counter] == 0
160
161loop.body:
162  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter  ; &x[counter]
163  %x.val = load i32, ptr addrspace(1) %x.plus.counter
164  %x.val.plus.1 = add i32 %x.val, 1
165  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter  ; x[counter] += 1
166  %counter.plus.1 = add i32 %counter, 1
167  %x.cond = icmp ult i32 %counter, 100  ; compares the pre-increment %counter
168  br i1 %x.cond, label %exit, label %A  ; NOTE(review): exits when %counter < 100, whereas the pseudo-code preceding these loop tests loops while (counter<100) - confirm this is intended
169
170exit:
171  ret void
172}
173
; Loop with two early exits: %A leaves the loop when a[counter] == 0 and %B
; leaves it when b[counter] == 0; otherwise %loop.body increments x[counter],
; bumps the counter and decides whether to take the back edge to %A.
174define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
175; GFX10-LABEL: loop_with_2breaks:
176; GFX10:       ; %bb.0: ; %entry
177; GFX10-NEXT:    s_mov_b32 s0, 0
178; GFX10-NEXT:    ; implicit-def: $sgpr1
179; GFX10-NEXT:    v_mov_b32_e32 v6, s0
180; GFX10-NEXT:    s_branch .LBB3_3
181; GFX10-NEXT:  .LBB3_1: ; %Flow3
182; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
183; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
184; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
185; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
186; GFX10-NEXT:    s_and_b32 s3, exec_lo, s4
187; GFX10-NEXT:    s_or_b32 s1, s1, s3
188; GFX10-NEXT:  .LBB3_2: ; %Flow
189; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
190; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
191; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
192; GFX10-NEXT:    s_or_b32 s0, s2, s0
193; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
194; GFX10-NEXT:    s_cbranch_execz .LBB3_6
195; GFX10-NEXT:  .LBB3_3: ; %A
196; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
197; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
198; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
199; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
200; GFX10-NEXT:    s_or_b32 s1, s1, s2
201; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
202; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
203; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
204; GFX10-NEXT:    global_load_dword v9, v[9:10], off
205; GFX10-NEXT:    s_waitcnt vmcnt(0)
206; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
207; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
208; GFX10-NEXT:    s_cbranch_execz .LBB3_2
209; GFX10-NEXT:  ; %bb.4: ; %B
210; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
211; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
212; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
213; GFX10-NEXT:    s_mov_b32 s4, -1
214; GFX10-NEXT:    global_load_dword v9, v[9:10], off
215; GFX10-NEXT:    s_waitcnt vmcnt(0)
216; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
217; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
218; GFX10-NEXT:    s_cbranch_execz .LBB3_1
219; GFX10-NEXT:  ; %bb.5: ; %loop.body
220; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
221; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
222; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
223; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
224; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
225; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
226; GFX10-NEXT:    global_load_dword v9, v[7:8], off
227; GFX10-NEXT:    v_mov_b32_e32 v6, v10
228; GFX10-NEXT:    s_and_b32 s5, exec_lo, vcc_lo
229; GFX10-NEXT:    s_or_b32 s4, s4, s5
230; GFX10-NEXT:    s_waitcnt vmcnt(0)
231; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
232; GFX10-NEXT:    global_store_dword v[7:8], v9, off
233; GFX10-NEXT:    s_branch .LBB3_1
234; GFX10-NEXT:  .LBB3_6: ; %exit
235; GFX10-NEXT:    s_endpgm
236entry:
237  br label %A
238
239A:
240  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]  ; 0 on entry, %counter.plus.1 on the back edge
241  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter  ; &a[counter]
242  %a.val = load i32, ptr addrspace(1) %a.plus.counter
243  %a.cond = icmp eq i32 %a.val, 0
244  br i1 %a.cond, label %exit, label %B  ; first break: leave the loop when a[counter] == 0
245
246B:
247  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter  ; &b[counter]
248  %b.val = load i32, ptr addrspace(1) %b.plus.counter
249  %b.cond = icmp eq i32 %b.val, 0
250  br i1 %b.cond, label %exit, label %loop.body  ; second break: leave the loop when b[counter] == 0
251
252loop.body:
253  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter  ; &x[counter]
254  %x.val = load i32, ptr addrspace(1) %x.plus.counter
255  %x.val.plus.1 = add i32 %x.val, 1
256  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter  ; x[counter] += 1
257  %counter.plus.1 = add i32 %counter, 1
258  %x.cond = icmp ult i32 %counter, 100  ; compares the pre-increment %counter
259  br i1 %x.cond, label %exit, label %A  ; NOTE(review): exits when %counter < 100, whereas the pseudo-code earlier in this file loops while (counter<100) - confirm this is intended
260
261exit:
262  ret void
263}
264
; Loop with three early exits: %A, %B and %C leave the loop when a[counter],
; b[counter] or c[counter] respectively is 0; otherwise %loop.body increments
; x[counter], bumps the counter and decides whether to take the back edge to %A.
265define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
266; GFX10-LABEL: loop_with_3breaks:
267; GFX10:       ; %bb.0: ; %entry
268; GFX10-NEXT:    s_mov_b32 s0, 0
269; GFX10-NEXT:    ; implicit-def: $sgpr1
270; GFX10-NEXT:    v_mov_b32_e32 v8, s0
271; GFX10-NEXT:    s_branch .LBB4_4
272; GFX10-NEXT:  .LBB4_1: ; %Flow5
273; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
274; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
275; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
276; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
277; GFX10-NEXT:    s_and_b32 s5, exec_lo, s5
278; GFX10-NEXT:    s_or_b32 s4, s4, s5
279; GFX10-NEXT:  .LBB4_2: ; %Flow4
280; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
281; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
282; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
283; GFX10-NEXT:    s_and_b32 s3, exec_lo, s4
284; GFX10-NEXT:    s_or_b32 s1, s1, s3
285; GFX10-NEXT:  .LBB4_3: ; %Flow
286; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
287; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
288; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
289; GFX10-NEXT:    s_or_b32 s0, s2, s0
290; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
291; GFX10-NEXT:    s_cbranch_execz .LBB4_8
292; GFX10-NEXT:  .LBB4_4: ; %A
293; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
294; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
295; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
296; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
297; GFX10-NEXT:    s_or_b32 s1, s1, s2
298; GFX10-NEXT:    v_lshlrev_b64 v[9:10], 2, v[8:9]
299; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v2, v9
300; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
301; GFX10-NEXT:    global_load_dword v11, v[11:12], off
302; GFX10-NEXT:    s_waitcnt vmcnt(0)
303; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
304; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
305; GFX10-NEXT:    s_cbranch_execz .LBB4_3
306; GFX10-NEXT:  ; %bb.5: ; %B
307; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
308; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
309; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
310; GFX10-NEXT:    s_mov_b32 s4, -1
311; GFX10-NEXT:    global_load_dword v11, v[11:12], off
312; GFX10-NEXT:    s_waitcnt vmcnt(0)
313; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
314; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
315; GFX10-NEXT:    s_cbranch_execz .LBB4_2
316; GFX10-NEXT:  ; %bb.6: ; %C
317; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
318; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
319; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
320; GFX10-NEXT:    s_mov_b32 s5, -1
321; GFX10-NEXT:    global_load_dword v11, v[11:12], off
322; GFX10-NEXT:    s_waitcnt vmcnt(0)
323; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
324; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
325; GFX10-NEXT:    s_cbranch_execz .LBB4_1
326; GFX10-NEXT:  ; %bb.7: ; %loop.body
327; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
328; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v0, v9
329; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
330; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v8
331; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v8
332; GFX10-NEXT:    s_andn2_b32 s5, -1, exec_lo
333; GFX10-NEXT:    global_load_dword v11, v[9:10], off
334; GFX10-NEXT:    v_mov_b32_e32 v8, v12
335; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
336; GFX10-NEXT:    s_or_b32 s5, s5, s6
337; GFX10-NEXT:    s_waitcnt vmcnt(0)
338; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v11
339; GFX10-NEXT:    global_store_dword v[9:10], v11, off
340; GFX10-NEXT:    s_branch .LBB4_1
341; GFX10-NEXT:  .LBB4_8: ; %exit
342; GFX10-NEXT:    s_endpgm
343entry:
344  br label %A
345
346A:
347  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]  ; 0 on entry, %counter.plus.1 on the back edge
348  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter  ; &a[counter]
349  %a.val = load i32, ptr addrspace(1) %a.plus.counter
350  %a.cond = icmp eq i32 %a.val, 0
351  br i1 %a.cond, label %exit, label %B  ; first break: leave the loop when a[counter] == 0
352
353B:
354  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter  ; &b[counter]
355  %b.val = load i32, ptr addrspace(1) %b.plus.counter
356  %b.cond = icmp eq i32 %b.val, 0
357  br i1 %b.cond, label %exit, label %C  ; second break: leave the loop when b[counter] == 0
358
359C:
360  %c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter  ; &c[counter]
361  %c.val = load i32, ptr addrspace(1) %c.plus.counter
362  %c.cond = icmp eq i32 %c.val, 0
363  br i1 %c.cond, label %exit, label %loop.body  ; third break: leave the loop when c[counter] == 0
364
365loop.body:
366  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter  ; &x[counter]
367  %x.val = load i32, ptr addrspace(1) %x.plus.counter
368  %x.val.plus.1 = add i32 %x.val, 1
369  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter  ; x[counter] += 1
370  %counter.plus.1 = add i32 %counter, 1
371  %x.cond = icmp ult i32 %counter, 100  ; compares the pre-increment %counter
372  br i1 %x.cond, label %exit, label %A  ; NOTE(review): exits when %counter < 100, whereas the pseudo-code earlier in this file loops while (counter<100) - confirm this is intended
373
374exit:
375  ret void
376}
377
378; Divergent 'if' with a body that ends with a break. This is a loop with two
379; exits, but the structurizer will create a phi that tracks the exit via break
380; and move break.body after the loop. The loop will then have one exit, and the
381; phi is used outside of the loop as the condition for entering break.body.
382define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
383; GFX10-LABEL: loop_with_div_break_with_body:
384; GFX10:       ; %bb.0: ; %entry
385; GFX10-NEXT:    s_mov_b32 s0, 0
386; GFX10-NEXT:    ; implicit-def: $sgpr1
387; GFX10-NEXT:    ; implicit-def: $sgpr2
388; GFX10-NEXT:    ; implicit-def: $sgpr3
389; GFX10-NEXT:    v_mov_b32_e32 v6, s0
390; GFX10-NEXT:    s_branch .LBB5_2
391; GFX10-NEXT:  .LBB5_1: ; %Flow
392; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
393; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
394; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
395; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
396; GFX10-NEXT:    s_or_b32 s0, s4, s0
397; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
398; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
399; GFX10-NEXT:    s_or_b32 s1, s1, s4
400; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
401; GFX10-NEXT:    s_cbranch_execz .LBB5_4
402; GFX10-NEXT:  .LBB5_2: ; %A
403; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
404; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
405; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
406; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
407; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
408; GFX10-NEXT:    s_or_b32 s3, s3, s4
409; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
410; GFX10-NEXT:    s_or_b32 s2, s2, s4
411; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
412; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
413; GFX10-NEXT:    global_load_dword v9, v[9:10], off
414; GFX10-NEXT:    s_waitcnt vmcnt(0)
415; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
416; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
417; GFX10-NEXT:    s_cbranch_execz .LBB5_1
418; GFX10-NEXT:  ; %bb.3: ; %loop.body
419; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
420; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
421; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
422; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
423; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
424; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
425; GFX10-NEXT:    global_load_dword v9, v[7:8], off
426; GFX10-NEXT:    s_and_b32 s5, exec_lo, 0
427; GFX10-NEXT:    v_mov_b32_e32 v6, v10
428; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
429; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
430; GFX10-NEXT:    s_or_b32 s3, s3, s5
431; GFX10-NEXT:    s_or_b32 s2, s2, s6
432; GFX10-NEXT:    s_waitcnt vmcnt(0)
433; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
434; GFX10-NEXT:    global_store_dword v[7:8], v9, off
435; GFX10-NEXT:    s_branch .LBB5_1
436; GFX10-NEXT:  .LBB5_4: ; %loop.exit.guard
437; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
438; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
439; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
440; GFX10-NEXT:    s_cbranch_execz .LBB5_6
441; GFX10-NEXT:  ; %bb.5: ; %break.body
442; GFX10-NEXT:    v_mov_b32_e32 v0, 10
443; GFX10-NEXT:    global_store_dword v[4:5], v0, off
444; GFX10-NEXT:  .LBB5_6: ; %exit
445; GFX10-NEXT:    s_endpgm
446entry:
447  br label %A
448
449A:
450  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]  ; 0 on entry, %counter.plus.1 on the back edge
451  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter  ; &a[counter]
452  %a.val = load i32, ptr addrspace(1) %a.plus.counter
453  %a.cond = icmp eq i32 %a.val, 0
454  br i1 %a.cond, label %break.body, label %loop.body  ; break with a body: go to %break.body when a[counter] == 0
455
456break.body:
457  store i32 10, ptr addrspace(1) %a.break  ; side effect performed only on the break path
458  br label %exit
459
460
461loop.body:
462  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter  ; &x[counter]
463  %x.val = load i32, ptr addrspace(1) %x.plus.counter
464  %x.val.plus.1 = add i32 %x.val, 1
465  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter  ; x[counter] += 1
466  %counter.plus.1 = add i32 %counter, 1
467  %x.cond = icmp ult i32 %counter, 100  ; compares the pre-increment %counter
468  br i1 %x.cond, label %exit, label %A  ; second loop exit: goes to %exit when %counter < 100, otherwise back to %A
469
470exit:
471  ret void
472}
473
474; Snippet from a test generated by the GraphicsFuzz tool; the frontend generates IR
475; with an irreducible control flow graph. FixIrreducible converts it into a natural
476; loop and in the process creates an i1 phi with three incoming values.
477
478; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
479;   do {
480;     if (y < a2) {
481;       do {
482;       } while (x < a2);
483;     }
484;     if (x < a3) {
485;       return a1;
486;     }
487;   } while (y < a2);
488;   return a0;
489; }
490
491; This test is also interesting because it has phi with three incomings
492;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
493;.entry:
494; %.y_lt_a2 = icmp sgt i32 %a2, %y
495; %.x_lt_a2 = icmp sgt i32 %a2, %x
496; %.x_lt_a3 = icmp sgt i32 %a3, %x
497; br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
498;
499;.preheader: ; if (y < a2),
500; br label %.inner_loop
501;
502;.inner_loop: ; do while x < a2
503; br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
504;
505;.loopexit: ; if x < a3
506; %not.inner_loop = xor i1 %.y_lt_a2, true
507; %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
508; %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0               ; select return value a1 'x < a3' or a0 'loop ends'
509; br i1 %brmerge, label %.exit, label %.preheader
510;
511;.exit:
512; ret i32 %.ret
513;}
514
515