; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Simple case, if - then, that requires lane mask merging:
; the %phi lane mask will hold %val_A at %A. Lanes that are active in %B
; will overwrite their own lane bit in the lane mask with %val_B.
define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: divergent_i1_phi_if_then:
; GFX10:       ; %bb.0: ; %A
; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT:  ; %bb.1: ; %B
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s0, s0, s2
; GFX10-NEXT:  ; %bb.2: ; %exit
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
A:
  %val_A = icmp uge i32 %tid, 6
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %B, label %exit

B:
  %val_B = icmp ult i32 %tid, 1
  br label %exit

exit:
  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
  %sel = select i1 %phi, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; if - else
define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: divergent_i1_phi_if_else:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_and_b32 s0, 1, s0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT:  ; %bb.1: ; %B
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT:    ; implicit-def: $vgpr2
; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s0, s0, s2
; GFX10-NEXT:  ; %bb.2: ; %Flow
; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT:  ; %bb.3: ; %A
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s0, s0, s2
; GFX10-NEXT:  ; %bb.4: ; %exit
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
entry:
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %A, label %B

A:
  %val_A = icmp uge i32 %tid, 1
  br label %exit

B:
  %val_B = icmp ult i32 %tid, 2
  br label %exit

exit:
  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
  %sel = select i1 %phi, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; if - break;

; counter = 0;
; do {
;   if (a[counter] == 0)
;     break;
;   if (b[counter] == 0)
;     break;
;   if (c[counter] == 0)
;     break;
;   x[counter++] += 1;
; } while (counter < 100);

; Tests with multiple break conditions. Divergent phis will be used to track
; whether any of the break conditions was reached. We only need to do simple lane
; mask merging (for the current loop iteration only). There is an intrinsic,
; if_break, that merges lane masks across all iterations of the loop.
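;
; As a rough sketch (added for illustration, not produced by the test script):
; the per-iteration lane mask merge implemented by the s_andn2_b32 / s_and_b32 /
; s_or_b32 sequences in the checks below is approximately
;   merged = (old_mask & ~exec) | (cond_mask & exec)
; i.e. active lanes overwrite their bit with the value computed in this
; iteration, while inactive lanes keep the previously merged value.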

define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
; GFX10-LABEL: loop_with_1break:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $sgpr1
; GFX10-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-NEXT:    s_branch .LBB2_2
; GFX10-NEXT:  .LBB2_1: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
; GFX10-NEXT:    s_or_b32 s0, s2, s0
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB2_4
; GFX10-NEXT:  .LBB2_2: ; %A
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
; GFX10-NEXT:    s_or_b32 s1, s1, s2
; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 2, v[4:5]
; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v2, v5
; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
; GFX10-NEXT:    global_load_dword v7, v[7:8], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB2_1
; GFX10-NEXT:  ; %bb.3: ; %loop.body
; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v0, v5
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v4
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v4
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    global_load_dword v7, v[5:6], off
; GFX10-NEXT:    v_mov_b32_e32 v4, v8
; GFX10-NEXT:    s_and_b32 s3, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s1, s1, s3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v7
; GFX10-NEXT:    global_store_dword v[5:6], v7, off
; GFX10-NEXT:    s_branch .LBB2_1
; GFX10-NEXT:  .LBB2_4: ; %exit
; GFX10-NEXT:    s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %exit, label %loop.body

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX10-LABEL: loop_with_2breaks:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $sgpr1
; GFX10-NEXT:    v_mov_b32_e32 v6, s0
; GFX10-NEXT:    s_branch .LBB3_3
; GFX10-NEXT:  .LBB3_1: ; %Flow3
; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s3, exec_lo, s4
; GFX10-NEXT:    s_or_b32 s1, s1, s3
; GFX10-NEXT:  .LBB3_2: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
; GFX10-NEXT:    s_or_b32 s0, s2, s0
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB3_6
; GFX10-NEXT:  .LBB3_3: ; %A
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
; GFX10-NEXT:    s_or_b32 s1, s1, s2
; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; GFX10-NEXT:    global_load_dword v9, v[9:10], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB3_2
; GFX10-NEXT:  ; %bb.4: ; %B
; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
; GFX10-NEXT:    s_mov_b32 s4, -1
; GFX10-NEXT:    global_load_dword v9, v[9:10], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB3_1
; GFX10-NEXT:  ; %bb.5: ; %loop.body
; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
; GFX10-NEXT:    global_load_dword v9, v[7:8], off
; GFX10-NEXT:    v_mov_b32_e32 v6, v10
; GFX10-NEXT:    s_and_b32 s5, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s4, s4, s5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
; GFX10-NEXT:    global_store_dword v[7:8], v9, off
; GFX10-NEXT:    s_branch .LBB3_1
; GFX10-NEXT:  .LBB3_6: ; %exit
; GFX10-NEXT:    s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %exit, label %B

B:
  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
  %b.val = load i32, ptr addrspace(1) %b.plus.counter
  %b.cond = icmp eq i32 %b.val, 0
  br i1 %b.cond, label %exit, label %loop.body

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
; GFX10-LABEL: loop_with_3breaks:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $sgpr1
; GFX10-NEXT:    v_mov_b32_e32 v8, s0
; GFX10-NEXT:    s_branch .LBB4_4
; GFX10-NEXT:  .LBB4_1: ; %Flow5
; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
; GFX10-NEXT:    s_and_b32 s5, exec_lo, s5
; GFX10-NEXT:    s_or_b32 s4, s4, s5
; GFX10-NEXT:  .LBB4_2: ; %Flow4
; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s3, exec_lo, s4
; GFX10-NEXT:    s_or_b32 s1, s1, s3
; GFX10-NEXT:  .LBB4_3: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
; GFX10-NEXT:    s_or_b32 s0, s2, s0
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB4_8
; GFX10-NEXT:  .LBB4_4: ; %A
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
; GFX10-NEXT:    s_or_b32 s1, s1, s2
; GFX10-NEXT:    v_lshlrev_b64 v[9:10], 2, v[8:9]
; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v2, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
; GFX10-NEXT:    global_load_dword v11, v[11:12], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB4_3
; GFX10-NEXT:  ; %bb.5: ; %B
; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
; GFX10-NEXT:    s_mov_b32 s4, -1
; GFX10-NEXT:    global_load_dword v11, v[11:12], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB4_2
; GFX10-NEXT:  ; %bb.6: ; %C
; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    s_mov_b32 s5, -1
; GFX10-NEXT:    global_load_dword v11, v[11:12], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB4_1
; GFX10-NEXT:  ; %bb.7: ; %loop.body
; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v0, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v8
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v8
; GFX10-NEXT:    s_andn2_b32 s5, -1, exec_lo
; GFX10-NEXT:    global_load_dword v11, v[9:10], off
; GFX10-NEXT:    v_mov_b32_e32 v8, v12
; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s5, s5, s6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v11
; GFX10-NEXT:    global_store_dword v[9:10], v11, off
; GFX10-NEXT:    s_branch .LBB4_1
; GFX10-NEXT:  .LBB4_8: ; %exit
; GFX10-NEXT:    s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %exit, label %B

B:
  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
  %b.val = load i32, ptr addrspace(1) %b.plus.counter
  %b.cond = icmp eq i32 %b.val, 0
  br i1 %b.cond, label %exit, label %C

C:
  %c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter
  %c.val = load i32, ptr addrspace(1) %c.plus.counter
  %c.cond = icmp eq i32 %c.val, 0
  br i1 %c.cond, label %exit, label %loop.body

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

; Divergent if with a body that ends in a break. This is a loop with two
; exits, but the structurizer will create a phi that tracks the exit taken
; through the break and will move break.body after the loop. The loop will then
; have one exit, and the phi is used outside of the loop as the condition for
; entering break.body.
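;
; A rough C-like sketch of the tested pattern (added for illustration, not part
; of the generated checks; 'a_break' stands for the %a.break argument):
;   counter = 0;
;   do {
;     if (a[counter] == 0) {
;       a_break[0] = 10;   // break.body: divergent break with a side effect
;       break;
;     }
;     x[counter++] += 1;
;   } while (counter < 100);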
define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_div_break_with_body:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $sgpr1
; GFX10-NEXT:    ; implicit-def: $sgpr2
; GFX10-NEXT:    ; implicit-def: $sgpr3
; GFX10-NEXT:    v_mov_b32_e32 v6, s0
; GFX10-NEXT:    s_branch .LBB5_2
; GFX10-NEXT:  .LBB5_1: ; %Flow
; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
; GFX10-NEXT:    s_or_b32 s0, s4, s0
; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
; GFX10-NEXT:    s_or_b32 s1, s1, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB5_4
; GFX10-NEXT:  .LBB5_2: ; %A
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT:    s_or_b32 s3, s3, s4
; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
; GFX10-NEXT:    s_or_b32 s2, s2, s4
; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; GFX10-NEXT:    global_load_dword v9, v[9:10], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB5_1
; GFX10-NEXT:  ; %bb.3: ; %loop.body
; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT:    global_load_dword v9, v[7:8], off
; GFX10-NEXT:    s_and_b32 s5, exec_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v6, v10
; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
; GFX10-NEXT:    s_or_b32 s3, s3, s5
; GFX10-NEXT:    s_or_b32 s2, s2, s6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
; GFX10-NEXT:    global_store_dword v[7:8], v9, off
; GFX10-NEXT:    s_branch .LBB5_1
; GFX10-NEXT:  .LBB5_4: ; %loop.exit.guard
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
; GFX10-NEXT:    s_cbranch_execz .LBB5_6
; GFX10-NEXT:  ; %bb.5: ; %break.body
; GFX10-NEXT:    v_mov_b32_e32 v0, 10
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:  .LBB5_6: ; %exit
; GFX10-NEXT:    s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %break.body, label %loop.body

break.body:
  store i32 10, ptr addrspace(1) %a.break
  br label %exit


loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

; Snippet from a test generated by the GraphicsFuzz tool; the frontend generates
; IR with an irreducible control flow graph. FixIrreducible converts it into a
; natural loop and in the process creates an i1 phi with three incoming values.

; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
;   do {
;     if (y < a2) {
;       do {
;       } while (x < a2);
;     }
;     if (x < a3) {
;       return a1;
;     }
;   } while (y < a2);
;   return a0;
; }

; This test is also interesting because it has a phi with three incoming values.
;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
;.entry:
;  %.y_lt_a2 = icmp sgt i32 %a2, %y
;  %.x_lt_a2 = icmp sgt i32 %a2, %x
;  %.x_lt_a3 = icmp sgt i32 %a3, %x
;  br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
;
;.preheader: ; if (y < a2)
;  br label %.inner_loop
;
;.inner_loop: ; do while x < a2
;  br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
;
;.loopexit: ; if x < a3
;  %not.inner_loop = xor i1 %.y_lt_a2, true
;  %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
;  %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0 ; select return value: a1 if 'x < a3', else a0 when the loop ends
;  br i1 %brmerge, label %.exit, label %.preheader
;
;.exit:
;  ret i32 %.ret
;}