; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1010 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1100 %s

; Test that unused lanes in the s_xor result are masked out with v_cndmask.
;
; In @combine_add_zext_xor the loop block %bb9 inverts an i1 phi (xor with
; true), zero-extends it and adds it to the loop value %.2. For both run lines
; the expected code keeps the s_xor_b32 on the SGPR written by
; v_cmp_eq_u32_e64, turns it into a 0/1 value with v_cndmask_b32_e64, and only
; then performs the v_add_nc_u32.

define i32 @combine_add_zext_xor() {
; GFX1010-LABEL: combine_add_zext_xor:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB0_2
; GFX1010-NEXT:  .LBB0_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1010-NEXT:    s_xor_b32 s4, s4, -1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX1010-NEXT:    v_mov_b32_e32 v1, v2
; GFX1010-NEXT:    s_cbranch_vccz .LBB0_4
; GFX1010-NEXT:  .LBB0_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB0_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB0_1
; GFX1010-NEXT:  .LBB0_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_add_zext_xor:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB0_2
; GFX1100-NEXT:  .LBB0_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    s_xor_b32 s0, s0, -1
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX1100-NEXT:    v_mov_b32_e32 v1, v2
; GFX1100-NEXT:    s_cbranch_vccz .LBB0_4
; GFX1100-NEXT:  .LBB0_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB0_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB0_1
; GFX1100-NEXT:  .LBB0_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  ; The branch condition is undef, so %bb9 has one predecessor that performs
  ; the load (%bb) and one that skips it (%.a).
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  ; The i1 is undef on the edge from %.a; the expected output shows the
  ; corresponding SGPR as "implicit-def" in the loop header.
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %.2.0.in = xor i1 %.2.0.in.in, true
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = add i32 %.2, %.2.0
  ; -1050 shows up as the literal 0xfffffbe6 in the expected compare.
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_xor result are masked out with v_cndmask.
; In @combine_sub_zext_xor the loop block %bb9 inverts an i1 phi (xor with
; true), zero-extends it and subtracts it from the loop value %.2. For both run
; lines the expected code keeps the s_xor_b32 on the SGPR written by
; v_cmp_eq_u32_e64, turns it into a 0/1 value with v_cndmask_b32_e64, and only
; then performs the v_sub_nc_u32.

define i32 @combine_sub_zext_xor() {
; GFX1010-LABEL: combine_sub_zext_xor:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB1_2
; GFX1010-NEXT:  .LBB1_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1010-NEXT:    s_xor_b32 s4, s4, -1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_sub_nc_u32_e32 v2, v1, v0
; GFX1010-NEXT:    v_mov_b32_e32 v1, v2
; GFX1010-NEXT:    s_cbranch_vccz .LBB1_4
; GFX1010-NEXT:  .LBB1_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB1_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB1_1
; GFX1010-NEXT:  .LBB1_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_sub_zext_xor:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB1_2
; GFX1100-NEXT:  .LBB1_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    s_xor_b32 s0, s0, -1
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_sub_nc_u32_e32 v2, v1, v0
; GFX1100-NEXT:    v_mov_b32_e32 v1, v2
; GFX1100-NEXT:    s_cbranch_vccz .LBB1_4
; GFX1100-NEXT:  .LBB1_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB1_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB1_1
; GFX1100-NEXT:  .LBB1_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  ; The branch condition is undef, so %bb9 has one predecessor that performs
  ; the load (%bb) and one that skips it (%.a).
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  ; The i1 is undef on the edge from %.a; the expected output shows the
  ; corresponding SGPR as "implicit-def" in the loop header.
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %.2.0.in = xor i1 %.2.0.in.in, true
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = sub i32 %.2, %.2.0
  ; -1050 shows up as the literal 0xfffffbe6 in the expected compare.
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_or result are masked out with v_cndmask.
; In @combine_add_zext_or the i1 phi is or'ed with the comparison
; %.2 > -1050 (the same comparison that controls the loop back-edge) before it
; is zero-extended and added to %.2. For both run lines the expected loop
; updates the counter with a scalar s_add_i32 by 1; the s_or_b32 and the
; v_cndmask_b32_e64 that materializes the returned 0/1 value appear only in
; %.exit.

define i32 @combine_add_zext_or() {
; GFX1010-LABEL: combine_add_zext_or:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    s_mov_b32 s4, 0
; GFX1010-NEXT:    s_branch .LBB2_2
; GFX1010-NEXT:  .LBB2_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
; GFX1010-NEXT:    s_cselect_b32 s6, -1, 0
; GFX1010-NEXT:    s_add_i32 s4, s4, 1
; GFX1010-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
; GFX1010-NEXT:    s_cbranch_vccz .LBB2_4
; GFX1010-NEXT:  .LBB2_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr5
; GFX1010-NEXT:    s_cbranch_scc1 .LBB2_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s5, 0, v0
; GFX1010-NEXT:    s_branch .LBB2_1
; GFX1010-NEXT:  .LBB2_4: ; %.exit
; GFX1010-NEXT:    s_or_b32 s4, s5, s6
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_add_zext_or:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:    s_branch .LBB2_2
; GFX1100-NEXT:  .LBB2_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
; GFX1100-NEXT:    s_cselect_b32 s2, -1, 0
; GFX1100-NEXT:    s_add_i32 s0, s0, 1
; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
; GFX1100-NEXT:    s_cbranch_vccz .LBB2_4
; GFX1100-NEXT:  .LBB2_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr1
; GFX1100-NEXT:    s_cbranch_scc1 .LBB2_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s1, 0, v0
; GFX1100-NEXT:    s_branch .LBB2_1
; GFX1100-NEXT:  .LBB2_4: ; %.exit
; GFX1100-NEXT:    s_or_b32 s0, s1, s2
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  ; The branch condition is undef, so %bb9 has one predecessor that performs
  ; the load (%bb) and one that skips it (%.a).
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  ; The i1 is undef on the edge from %.a; the expected output shows the
  ; corresponding SGPR as "implicit-def" in the loop header.
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  ; %t repeats the comparison used for the back-edge (%i12 below).
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = or i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = add i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_or result are masked out with v_cndmask.
; In @combine_sub_zext_or the i1 phi is or'ed with the comparison
; %.2 > -1050 (the same comparison that controls the loop back-edge) before it
; is zero-extended and subtracted from %.2. For both run lines the expected
; loop updates the counter with a scalar s_add_i32 by -1; the s_or_b32 and the
; v_cndmask_b32_e64 that materializes the returned 0/1 value appear only in
; %.exit.

define i32 @combine_sub_zext_or() {
; GFX1010-LABEL: combine_sub_zext_or:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    s_mov_b32 s4, 0
; GFX1010-NEXT:    s_branch .LBB3_2
; GFX1010-NEXT:  .LBB3_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
; GFX1010-NEXT:    s_cselect_b32 s6, -1, 0
; GFX1010-NEXT:    s_add_i32 s4, s4, -1
; GFX1010-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
; GFX1010-NEXT:    s_cbranch_vccz .LBB3_4
; GFX1010-NEXT:  .LBB3_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr5
; GFX1010-NEXT:    s_cbranch_scc1 .LBB3_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s5, 0, v0
; GFX1010-NEXT:    s_branch .LBB3_1
; GFX1010-NEXT:  .LBB3_4: ; %.exit
; GFX1010-NEXT:    s_or_b32 s4, s5, s6
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_sub_zext_or:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:    s_branch .LBB3_2
; GFX1100-NEXT:  .LBB3_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
; GFX1100-NEXT:    s_cselect_b32 s2, -1, 0
; GFX1100-NEXT:    s_add_i32 s0, s0, -1
; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
; GFX1100-NEXT:    s_cbranch_vccz .LBB3_4
; GFX1100-NEXT:  .LBB3_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr1
; GFX1100-NEXT:    s_cbranch_scc1 .LBB3_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s1, 0, v0
; GFX1100-NEXT:    s_branch .LBB3_1
; GFX1100-NEXT:  .LBB3_4: ; %.exit
; GFX1100-NEXT:    s_or_b32 s0, s1, s2
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  ; The branch condition is undef, so %bb9 has one predecessor that performs
  ; the load (%bb) and one that skips it (%.a).
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  ; The i1 is undef on the edge from %.a; the expected output shows the
  ; corresponding SGPR as "implicit-def" in the loop header.
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  ; %t repeats the comparison used for the back-edge (%i12 below).
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = or i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = sub i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_and result are masked out with v_cndmask.
; In @combine_add_zext_and the i1 phi is and'ed with the comparison
; %.2 > -1050 (the same comparison that controls the loop back-edge) before it
; is zero-extended and added to %.2. For both run lines the expected loop body
; compares into vcc_lo, combines it with the loaded condition via s_and_b32,
; turns the result into a 0/1 value with v_cndmask_b32_e64, and then performs
; the v_add_nc_u32.

define i32 @combine_add_zext_and() {
; GFX1010-LABEL: combine_add_zext_and:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB4_2
; GFX1010-NEXT:  .LBB4_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    s_and_b32 s4, s4, vcc_lo
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_add_nc_u32_e32 v1, v1, v0
; GFX1010-NEXT:    s_cbranch_vccz .LBB4_4
; GFX1010-NEXT:  .LBB4_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB4_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB4_1
; GFX1010-NEXT:  .LBB4_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_add_zext_and:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB4_2
; GFX1100-NEXT:  .LBB4_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    v_add_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT:    s_cbranch_vccz .LBB4_4
; GFX1100-NEXT:  .LBB4_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB4_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB4_1
; GFX1100-NEXT:  .LBB4_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  ; The branch condition is undef, so %bb9 has one predecessor that performs
  ; the load (%bb) and one that skips it (%.a).
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  ; The i1 is undef on the edge from %.a; the expected output shows the
  ; corresponding SGPR as "implicit-def" in the loop header.
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  ; %t repeats the comparison used for the back-edge (%i12 below).
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = and i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = add i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_and result are masked out with v_cndmask.
; In @combine_sub_zext_and the i1 phi is and'ed with the comparison
; %.2 > -1050 (the same comparison that controls the loop back-edge) before it
; is zero-extended and subtracted from %.2. For both run lines the expected
; loop body compares into vcc_lo, combines it with the loaded condition via
; s_and_b32, turns the result into a 0/1 value with v_cndmask_b32_e64, and then
; performs the v_sub_nc_u32.

define i32 @combine_sub_zext_and() {
; GFX1010-LABEL: combine_sub_zext_and:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB5_2
; GFX1010-NEXT:  .LBB5_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    s_and_b32 s4, s4, vcc_lo
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_sub_nc_u32_e32 v1, v1, v0
; GFX1010-NEXT:    s_cbranch_vccz .LBB5_4
; GFX1010-NEXT:  .LBB5_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB5_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB5_1
; GFX1010-NEXT:  .LBB5_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_sub_zext_and:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB5_2
; GFX1100-NEXT:  .LBB5_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    v_sub_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT:    s_cbranch_vccz .LBB5_4
; GFX1100-NEXT:  .LBB5_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB5_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB5_1
; GFX1100-NEXT:  .LBB5_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  ; The branch condition is undef, so %bb9 has one predecessor that performs
  ; the load (%bb) and one that skips it (%.a).
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  ; The i1 is undef on the edge from %.a; the expected output shows the
  ; corresponding SGPR as "implicit-def" in the loop header.
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  ; %t repeats the comparison used for the back-edge (%i12 below).
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = and i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = sub i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}


; Buffer-load intrinsic called by every test function in this file.
; Function Attrs: nounwind willreturn memory(argmem: read)
declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture, i32, i32, i32 immarg) #0

attributes #0 = { nounwind willreturn memory(argmem: read) }