; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; This file contains various tests that have divergent i1s used outside of
; the loop. These are lane masks in SGPRs and need to have the correct value in
; the corresponding bit at the iteration in which the lane exits the loop.
; This is achieved by merging the lane mask with the same lane mask from the
; previous iteration and using that merged lane mask outside of the loop.

; Phi used outside of the loop directly (loopfinder will figure out that it
; needs to merge lane mask across all iterations).
; %bool.counter starts as %pre.cond, is inverted on every iteration, and is the
; value read by the select in %exit. The loop is left once
; uitofp(%counter) > %val.
define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
; GFX10-LABEL: divergent_i1_phi_used_outside_loop:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    s_andn2_b32 s5, s4, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s6, s5, s6
; GFX10-NEXT:    ; implicit-def: $sgpr5
; GFX10-NEXT:  .LBB0_1: ; %loop
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
; GFX10-NEXT:    s_xor_b32 s7, s6, -1
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 s8, s6, exec_lo
; GFX10-NEXT:    s_and_b32 s7, exec_lo, s7
; GFX10-NEXT:    s_andn2_b32 s5, s5, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
; GFX10-NEXT:    s_or_b32 s7, s8, s7
; GFX10-NEXT:    s_or_b32 s5, s5, s6
; GFX10-NEXT:    s_mov_b32 s6, s7
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
; GFX10-NEXT:  ; %bb.2: ; %exit
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
; GFX10-NEXT:    flat_store_dword v[2:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  %pre.cond = fcmp ogt float %pre.cond.val, 1.0
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
  %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
  %neg.bool.counter = xor i1 %bool.counter, true
  %f.counter = uitofp i32 %counter to float
  %cond = fcmp ogt float %f.counter, %val
  %counter.plus.1 = add i32 %counter, 1
  br i1 %cond, label %exit, label %loop

exit:
  %select = select i1 %bool.counter, float 1.000000e+00, float 0.000000e+00
  store float %select, ptr %addr
  ret void
}

; Phi used outside of the loop, with a multi-block loop body: the phi
; %all.eq.zero gates the load-and-compare in %is.eq.zero and is the value read
; by the select in %exit. %val is not used by the body.
define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr addrspace(1) %a, ptr %addr) {
; GFX10-LABEL: divergent_i1_phi_used_outside_loop_larger_loop_body:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s4, -1
; GFX10-NEXT:    ; implicit-def: $sgpr6
; GFX10-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-NEXT:    s_andn2_b32 s5, s4, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
; GFX10-NEXT:    s_or_b32 s4, s5, s4
; GFX10-NEXT:    s_branch .LBB1_2
; GFX10-NEXT:  .LBB1_1: ; %loop.cond
; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, 4
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
; GFX10-NEXT:    s_andn2_b32 s7, s5, exec_lo
; GFX10-NEXT:    s_and_b32 s8, exec_lo, s6
; GFX10-NEXT:    v_cmp_le_i32_e32 vcc_lo, 10, v0
; GFX10-NEXT:    s_or_b32 s4, s7, s8
; GFX10-NEXT:    s_cbranch_vccz .LBB1_4
; GFX10-NEXT:  .LBB1_2: ; %loop.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_mov_b32 s5, s4
; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, s5
; GFX10-NEXT:    s_or_b32 s6, s4, s6
; GFX10-NEXT:    s_and_saveexec_b32 s4, s5
; GFX10-NEXT:    s_cbranch_execz .LBB1_1
; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX10-NEXT:    global_load_dword v5, v[1:2], off
; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
; GFX10-NEXT:    s_and_b32 s7, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s6, s6, s7
; GFX10-NEXT:    s_branch .LBB1_1
; GFX10-NEXT:  .LBB1_4: ; %exit
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
; GFX10-NEXT:    flat_store_dword v[3:4], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  br label %loop.start

loop.start:
  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
  %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero, %loop.cond ]
  br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond

is.eq.zero:
  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
  %elt.i = load i32, ptr addrspace(1) %a.plus.i
  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
  br label %loop.cond

loop.cond:
  %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
  %cond = icmp slt i32 %i, 10
  %i.plus.1 = add i32 %i, 1
  br i1 %cond, label %exit, label %loop.start

exit:
  %select = select i1 %all.eq.zero, float 1.000000e+00, float 0.000000e+00
  store float %select, ptr %addr
  ret void
}

; Non-phi used outside of the loop.
; Same loop as @divergent_i1_phi_used_outside_loop, but the select in %exit
; reads %neg.bool.counter (the xor result) instead of the phi.

define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, 1.0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    ; implicit-def: $sgpr6
; GFX10-NEXT:  .LBB2_1: ; %loop
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
; GFX10-NEXT:    s_xor_b32 s5, s5, -1
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_and_b32 s7, exec_lo, s5
; GFX10-NEXT:    s_or_b32 s6, s6, s7
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
; GFX10-NEXT:  ; %bb.2: ; %exit
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT:    flat_store_dword v[2:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  %pre.cond = fcmp ogt float %pre.cond.val, 1.0
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
  %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
  %neg.bool.counter = xor i1 %bool.counter, true
  %f.counter = uitofp i32 %counter to float
  %cond = fcmp ogt float %f.counter, %val
  %counter.plus.1 = add i32 %counter, 1
  br i1 %cond, label %exit, label %loop

exit:
  %select = select i1 %neg.bool.counter, float 1.000000e+00, float 0.000000e+00
  store float %select, ptr %addr
  ret void
}

; Rough C sketch of the control-flow shape exercised below:
;void xor(int num_elts, int* a, int* addr) {
;for(int i=0; i<num_elts; ++i) {
;  if(a[i]==0)
;    return;
;}
;addr[0] = 5;
;return;
;}
; NOTE(review): the IR does not follow the sketch literally. %entry enters the
; loop only when %num.elts == 0, and %loop.cond branches to %block.after.loop
; when %i < %num.elts (otherwise it continues the loop).

define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ptr addrspace(1) %a, ptr %addr) {
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s5, 0
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB3_6
; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    ; implicit-def: $sgpr6
; GFX10-NEXT:    ; implicit-def: $sgpr7
; GFX10-NEXT:    ; implicit-def: $sgpr8
; GFX10-NEXT:    s_branch .LBB3_3
; GFX10-NEXT:  .LBB3_2: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT:    s_xor_b32 s9, s8, -1
; GFX10-NEXT:    s_and_b32 s10, exec_lo, s7
; GFX10-NEXT:    s_or_b32 s5, s10, s5
; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_and_b32 s9, exec_lo, s9
; GFX10-NEXT:    s_or_b32 s6, s6, s9
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT:    s_cbranch_execz .LBB3_5
; GFX10-NEXT:  .LBB3_3: ; %loop.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT:    s_andn2_b32 s8, s8, exec_lo
; GFX10-NEXT:    s_and_b32 s9, exec_lo, -1
; GFX10-NEXT:    s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT:    s_or_b32 s8, s8, s9
; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
; GFX10-NEXT:    s_or_b32 s7, s7, s9
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
; GFX10-NEXT:    global_load_dword v6, v[6:7], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX10-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB3_2
; GFX10-NEXT:  ; %bb.4: ; %loop.cond
; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v5
; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
; GFX10-NEXT:    s_andn2_b32 s8, s8, exec_lo
; GFX10-NEXT:    s_and_b32 s10, exec_lo, 0
; GFX10-NEXT:    s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT:    v_mov_b32_e32 v5, v6
; GFX10-NEXT:    s_and_b32 s11, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s8, s8, s10
; GFX10-NEXT:    s_or_b32 s7, s7, s11
; GFX10-NEXT:    s_branch .LBB3_2
; GFX10-NEXT:  .LBB3_5: ; %loop.exit.guard
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT:    s_andn2_b32 s5, -1, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
; GFX10-NEXT:    s_or_b32 s6, s5, s6
; GFX10-NEXT:  .LBB3_6: ; %Flow1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
; GFX10-NEXT:    s_cbranch_execz .LBB3_8
; GFX10-NEXT:  ; %bb.7: ; %block.after.loop
; GFX10-NEXT:    v_mov_b32_e32 v0, 5
; GFX10-NEXT:    flat_store_dword v[3:4], v0
; GFX10-NEXT:  .LBB3_8: ; %exit
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  %start.cond = icmp eq i32 %num.elts, 0
  br i1 %start.cond, label %loop.start, label %block.after.loop

loop.start:
  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
  %elt.i = load i32, ptr addrspace(1) %a.plus.i
  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
  br i1 %elt.i.eq.zero, label %exit, label %loop.cond

loop.cond:
  %cond = icmp slt i32 %i, %num.elts
  %i.plus.1 = add i32 %i, 1
  br i1 %cond, label %block.after.loop, label %loop.start

block.after.loop:
  store i32 5, ptr %addr
  br label %exit

exit:
  ret void
}


; C sketch of the IR below (an infinite loop with a single break; the compare
; against v0 is repeated after the loop):
;void icmp(int v0, int v1, int* a, int* b, int* c, int* addr) {
;int i = 0;
;for(;;) {
;  if(v0==i)
;    a[i] = i;
;  if(v1==i)
;    break;
;  ++i;
;}
;if(v0==i)
;  c[0] = i;
;return;
;}
; %b and %addr are not referenced by the function body.

define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr %addr) {
; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s5, 0
; GFX10-NEXT:    ; implicit-def: $sgpr6
; GFX10-NEXT:    v_mov_b32_e32 v4, s5
; GFX10-NEXT:    s_branch .LBB4_2
; GFX10-NEXT:  .LBB4_1: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s7
; GFX10-NEXT:    s_or_b32 s5, s4, s5
; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s6, s4, s6
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT:    s_cbranch_execz .LBB4_6
; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX10-NEXT:    s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB4_4
; GFX10-NEXT:  ; %bb.3: ; %if.block.0
; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
; GFX10-NEXT:    v_lshlrev_b64 v[8:9], 2, v[4:5]
; GFX10-NEXT:    v_add_co_u32 v8, s4, v2, v8
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
; GFX10-NEXT:    global_store_dword v[8:9], v4, off
; GFX10-NEXT:  .LBB4_4: ; %loop.break.block
; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
; GFX10-NEXT:    s_mov_b32 s7, -1
; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
; GFX10-NEXT:    s_cbranch_execz .LBB4_1
; GFX10-NEXT:  ; %bb.5: ; %loop.cond
; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
; GFX10-NEXT:    s_and_b32 s7, exec_lo, 0
; GFX10-NEXT:    s_or_b32 s7, s4, s7
; GFX10-NEXT:    s_branch .LBB4_1
; GFX10-NEXT:  .LBB4_6: ; %cond.block.1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
; GFX10-NEXT:    s_cbranch_execz .LBB4_8
; GFX10-NEXT:  ; %bb.7: ; %if.block.1
; GFX10-NEXT:    global_store_dword v[6:7], v4, off
; GFX10-NEXT:  .LBB4_8: ; %exit
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  br label %loop.start

loop.start:
  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
  br label %cond.block.0

cond.block.0:
  %cond.0 = icmp eq i32 %v0, %i
  br i1 %cond.0, label %if.block.0, label %loop.break.block

if.block.0:
  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
  store i32 %i, ptr addrspace(1) %a.plus.i
  br label %loop.break.block

loop.break.block:
  %cond.1 = icmp eq i32 %v1, %i
  br i1 %cond.1, label %cond.block.1, label %loop.cond

loop.cond:
  ; no cond, infinite loop with one break
  %i.plus.1 = add i32 %i, 1
  br label %loop.start

cond.block.1:
  %cond.2 = icmp eq i32 %v0, %i
  br i1 %cond.2, label %if.block.1, label %exit

if.block.1:
  store i32 %i, ptr addrspace(1) %c
  br label %exit

exit:
  ret void
}


; bool all_eq_zero = true;
; i32 i = 0;
; do {
;   if(all_eq_zero)
;     all_eq_zero = (a[i] == 0);
;
;   i += 1;
; } while ( i < n )

; *addr = all_eq_zero ? 1.0 : 0.0;

; Check that all elements in an array of size n are zero. The loop has a
; divergent exit condition based on the array size, but the zero check does not
; break out of the loop; instead it skips the zero check in the remaining
; iterations.
; llpc "freezes" the zero check since it is (via phi) used in a conditional
; branch.
; NOTE(review): in the IR the branch to %exit is taken when %i < %n (with %i
; read before the increment), which is the opposite polarity of the
; "while ( i < n )" in the sketch above.
define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspace(1) %a, ptr %addr) {
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s3, -1
; GFX10-NEXT:    v_mov_b32_e32 v5, s0
; GFX10-NEXT:    ; implicit-def: $sgpr1
; GFX10-NEXT:    ; implicit-def: $sgpr2
; GFX10-NEXT:    s_branch .LBB5_2
; GFX10-NEXT:  .LBB5_1: ; %loop.cond
; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v5
; GFX10-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_or_b32 s3, s3, s4
; GFX10-NEXT:    s_or_b32 s1, s1, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB5_4
; GFX10-NEXT:  .LBB5_2: ; %loop.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
; GFX10-NEXT:    s_or_b32 s2, s2, s4
; GFX10-NEXT:    s_and_saveexec_b32 s4, s3
; GFX10-NEXT:    s_cbranch_execz .LBB5_1
; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
; GFX10-NEXT:    global_load_dword v6, v[6:7], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
; GFX10-NEXT:    s_and_b32 s3, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s2, s2, s3
; GFX10-NEXT:    ; implicit-def: $sgpr3
; GFX10-NEXT:    s_branch .LBB5_1
; GFX10-NEXT:  .LBB5_4: ; %exit
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s1
; GFX10-NEXT:    flat_store_dword v[3:4], v0
; GFX10-NEXT:    s_endpgm
entry:
  br label %loop.start

loop.start:
  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
  %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero.fr, %loop.cond ]
  br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond

is.eq.zero:
  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
  %elt.i = load i32, ptr addrspace(1) %a.plus.i
  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
  br label %loop.cond

loop.cond:
  %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
  %eq.zero.fr = freeze i1 %eq.zero
  %cond = icmp slt i32 %i, %n
  %i.plus.1 = add i32 %i, 1
  br i1 %cond, label %exit, label %loop.start

exit:
  %select = select i1 %eq.zero.fr, float 1.000000e+00, float 0.000000e+00
  store float %select, ptr %addr
  ret void
}

; Divergent i1 phi from structurize-cfg used outside of the loop.
; The IR below contains no i1 phi of its own; the %Flow and %loop.exit.guard
; blocks named in the check lines are not present in the IR (presumably they
; are introduced by CFG structurization - confirm against the pass pipeline).
; Loop %A leaves through %break.body when a[counter] == 0 and through %exit
; when counter < 100 (unsigned); only the %break.body path stores to %a.break.
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_1break:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $sgpr1
; GFX10-NEXT:    ; implicit-def: $sgpr2
; GFX10-NEXT:    ; implicit-def: $sgpr3
; GFX10-NEXT:    v_mov_b32_e32 v6, s0
; GFX10-NEXT:    s_branch .LBB6_2
; GFX10-NEXT:  .LBB6_1: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
; GFX10-NEXT:    s_or_b32 s0, s4, s0
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
; GFX10-NEXT:    s_or_b32 s1, s1, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB6_4
; GFX10-NEXT:  .LBB6_2: ; %A
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT:    s_or_b32 s3, s3, s4
; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
; GFX10-NEXT:    s_or_b32 s2, s2, s4
; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; GFX10-NEXT:    global_load_dword v9, v[9:10], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB6_1
; GFX10-NEXT:  ; %bb.3: ; %loop.body
; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT:    global_load_dword v9, v[7:8], off
; GFX10-NEXT:    s_and_b32 s5, exec_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v6, v10
; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s3, s3, s5
; GFX10-NEXT:    s_or_b32 s2, s2, s6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
; GFX10-NEXT:    global_store_dword v[7:8], v9, off
; GFX10-NEXT:    s_branch .LBB6_1
; GFX10-NEXT:  .LBB6_4: ; %loop.exit.guard
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB6_6
; GFX10-NEXT:  ; %bb.5: ; %break.body
; GFX10-NEXT:    v_mov_b32_e32 v0, 10
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:  .LBB6_6: ; %exit
; GFX10-NEXT:    s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %break.body, label %loop.body

break.body:
  store i32 10, ptr addrspace(1) %a.break
  br label %exit

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}
