; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s

; Codegen tests for the 32-bit llvm.amdgcn.ballot intrinsic. Both llc
; invocations above enable +wavefrontsize32, and most expectations are shared
; between the two targets through the common check prefix.
;
; The expected-assembly comments are autogenerated (see the first line), so
; they should be regenerated with that script rather than edited by hand. The
; number in each .LBB<N>_<M> label matches the zero-based position of the
; function in this file (the eighth function uses .LBB7_*), so inserting or
; reordering functions will presumably renumber them.

declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)

; Test ballot(0)
; The constant-false operand is expected to become a plain move of 0 into s0.

define amdgpu_cs i32 @constant_false() {
; CHECK-LABEL: constant_false:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_mov_b32 s0, 0
; CHECK-NEXT:    ; return to shader part epilog
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0)
  ret i32 %ballot
}

; Test ballot(1)
; The constant-true operand is expected to become a copy of exec_lo into s0.

define amdgpu_cs i32 @constant_true() {
; CHECK-LABEL: constant_true:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_mov_b32 s0, exec_lo
; CHECK-NEXT:    ; return to shader part epilog
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1)
  ret i32 %ballot
}

; Test ballot of a non-comparison operation
; The i1 operand comes from a trunc, so the expected code masks bit 0 of v0
; and compares the result against 0 to form the ballot in s0.

define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %trunc = trunc i32 %x to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
  ret i32 %ballot
}

; Test ballot of comparisons
; In the three functions below the compare feeds the ballot directly, and a
; single v_cmp writing s0 is expected.

; Integer equality of two arguments.
define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_eq_u32_e64 s0, v0, v1
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = icmp eq i32 %x, %y
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
  ret i32 %ballot
}

; Signed compare against a constant: `sge 99` is expected to be emitted as
; `0x62 (98) < x`.
define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_i32_e64 s0, 0x62, v0
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = icmp sge i32 %x, 99
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
  ret i32 %ballot
}

; Floating-point variant: `fcmp ogt` is expected to become v_cmp_gt_f32.
define amdgpu_cs i32 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = fcmp ogt float %x, %y
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
  ret i32 %ballot
}

; Population count of a ballot result: the compare is expected to write
; vcc_lo, which is then counted with s_bcnt1_i32_b32.
define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
; CHECK-NEXT:    s_bcnt1_i32_b32 s0, vcc_lo
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = fcmp ogt float %x, %y
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
  %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
  ret i32 %bcnt
}

; Branches on a ballot result.
;
; Naming used by the functions below:
;   divergent / uniform         - the operands are plain arguments (v0/v1 in
;                                 the expected code) or `inreg` arguments
;                                 (s0/s1 in the expected code).
;   ne_zero / eq_zero           - the branch condition is `icmp ne` or
;                                 `icmp eq` of the ballot against 0.
;   non_compare / compare / and - the ballot operand is a trunc, a single
;                                 icmp, or the `and` of two icmps.
; Every function returns 42 from %true and 33 from %false. In the eq_zero
; variants the expected code places %false first and %true second.

; trunc operand in a VGPR, branch on ballot != 0.
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB7_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB7_3
; CHECK-NEXT:  .LBB7_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB7_3
; CHECK-NEXT:  .LBB7_3:
  %c = trunc i32 %v to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; trunc operand in an SGPR, branch on ballot != 0. The bit is masked with a
; scalar and, but the compare is still expected to be a v_cmp into vcc_lo.
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_and_b32 s0, s0, 1
; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc_lo, s0, 0
; CHECK-NEXT:    s_cbranch_vccz .LBB8_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB8_3
; CHECK-NEXT:  .LBB8_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB8_3
; CHECK-NEXT:  .LBB8_3:
  %c = trunc i32 %v to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; trunc operand in a VGPR, branch on ballot == 0.
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB9_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB9_3
; CHECK-NEXT:  .LBB9_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB9_3
; CHECK-NEXT:  .LBB9_3:
  %c = trunc i32 %v to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; trunc operand in an SGPR, branch on ballot == 0.
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_and_b32 s0, s0, 1
; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc_lo, s0, 0
; CHECK-NEXT:    s_cbranch_vccz .LBB10_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB10_3
; CHECK-NEXT:  .LBB10_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB10_3
; CHECK-NEXT:  .LBB10_3:
  %c = trunc i32 %v to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `icmp ult 12` on a VGPR operand, branch on ballot != 0. The compare is
; expected to write vcc_lo directly, with no separate ballot value.
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB11_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB11_3
; CHECK-NEXT:  .LBB11_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB11_3
; CHECK-NEXT:  .LBB11_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `icmp ult 12` on an SGPR operand, branch on ballot != 0.
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc_lo, s0, 12
; CHECK-NEXT:    s_cbranch_vccz .LBB12_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB12_3
; CHECK-NEXT:  .LBB12_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB12_3
; CHECK-NEXT:  .LBB12_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `icmp ult 12` on a VGPR operand, branch on ballot == 0.
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB13_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB13_3
; CHECK-NEXT:  .LBB13_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB13_3
; CHECK-NEXT:  .LBB13_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `icmp ult 12` on an SGPR operand, branch on ballot == 0.
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc_lo, s0, 12
; CHECK-NEXT:    s_cbranch_vccz .LBB14_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB14_3
; CHECK-NEXT:  .LBB14_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB14_3
; CHECK-NEXT:  .LBB14_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `and` of two compares on VGPR operands, branch on ballot != 0. The two
; compare masks are expected to be combined with s_and_b32 into vcc_lo.
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT:    s_cbranch_vccz .LBB15_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB15_3
; CHECK-NEXT:  .LBB15_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB15_3
; CHECK-NEXT:  .LBB15_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `and` of two compares on SGPR operands, branch on ballot != 0. The expected
; code stays scalar: s_cmp/s_cselect per compare, an and with exec_lo, and a
; branch on SCC.
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
; CHECK-NEXT:    s_and_b32 s0, s0, s1
; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
; CHECK-NEXT:    s_cbranch_scc0 .LBB16_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB16_3
; CHECK-NEXT:  .LBB16_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB16_3
; CHECK-NEXT:  .LBB16_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `and` of two compares on VGPR operands, branch on ballot == 0.
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT:    s_cbranch_vccz .LBB17_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB17_3
; CHECK-NEXT:  .LBB17_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB17_3
; CHECK-NEXT:  .LBB17_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; `and` of two compares on SGPR operands, branch on ballot == 0.
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
; CHECK-NEXT:    s_and_b32 s0, s0, s1
; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB18_3
; CHECK-NEXT:  .LBB18_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB18_3
; CHECK-NEXT:  .LBB18_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; The ballot is compared against a non-zero constant (`sgt 22`) rather than
; tested for zero. The expected code materializes the ballot value in s0 and
; branches to %false on `s0 < 23`.
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, s0, 12
; CHECK-NEXT:    s_cmp_lt_i32 s0, 23
; CHECK-NEXT:    s_cbranch_scc1 .LBB19_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB19_3
; CHECK-NEXT:  .LBB19_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB19_3
; CHECK-NEXT:  .LBB19_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
  %bc = icmp sgt i32 %ballot, 22
  br i1 %bc, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; The "simulated negated ballot" functions below do not call the ballot
; intrinsic. They call llvm.amdgcn.icmp with operands (%c, false) and
; predicate 32 (ICMP_EQ, per the inline comments), i.e. an equality compare of
; %c with false. Their expected branches use vccnz / scc1 where the
; corresponding ballot tests above use vccz / scc0.
declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)

; VGPR operands, branch on the simulated negated ballot != 0.
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT:    s_cbranch_vccnz .LBB20_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB20_3
; CHECK-NEXT:  .LBB20_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB20_3
; CHECK-NEXT:  .LBB20_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; SGPR operands, branch on the simulated negated ballot != 0. The TODO inside
; the function records a possible improvement to the scalar sequence.
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; TODO:
;   s_cmp_lt_u32 s0, 12
;   s_cselect_b32 s0, -1, 0
;   s_cmp_gt_u32 s1, 34
;   s_cselect_b32 s1, -1, 0
;   s_and_b32 s0, s0, s1
;   s_and_b32 s0, s0, exec_lo
; could be improved to:
;   s_cmp_lt_u32 s0, 12
;   s_cselect_b32 s0, -1, 0
;   s_cmp_gt_u32 s1, 34
;   s_cselect_b32 s0, s0, 0
;   s_and_b32 s0, s0, exec_lo
; By selecting into vcc(_lo) instead, we could even avoid the AND-with-exec.
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
; CHECK-NEXT:    s_and_b32 s0, s0, s1
; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
; CHECK-NEXT:    s_cbranch_scc1 .LBB21_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB21_3
; CHECK-NEXT:  .LBB21_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB21_3
; CHECK-NEXT:  .LBB21_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_ne_zero = icmp ne i32 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; VGPR operands, branch on the simulated negated ballot == 0.
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT:    s_cbranch_vccnz .LBB22_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB22_3
; CHECK-NEXT:  .LBB22_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB22_3
; CHECK-NEXT:  .LBB22_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; SGPR operands, branch on the simulated negated ballot == 0.
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
; CHECK-NEXT:    s_and_b32 s0, s0, s1
; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
; CHECK-NEXT:    s_cbranch_scc1 .LBB23_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB23_3
; CHECK-NEXT:  .LBB23_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB23_3
; CHECK-NEXT:  .LBB23_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_eq_zero = icmp eq i32 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; Input that is not constant or direct result of a compare.
; Tests setting 0 to inactive lanes.
; The ballot operand here is a phi of i1 values produced in two different
; blocks (%A and %B), selected by a branch on %cond; the ballot result is
; stored to %out. The expected code rebuilds the operand with v_cndmask_b32 /
; v_cmp_ne_u32 after the control flow rejoins.
; Expectations are kept separately for the two targets because the emitted
; mnemonics differ (for example s_andn2_* vs s_and_not1_*, and
; global_store_dword vs global_store_b32).
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: non_cst_non_compare_input:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    ; implicit-def: $sgpr0
; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT:  ; %bb.1: ; %B
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
; GFX10-NEXT:    ; implicit-def: $vgpr2
; GFX10-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
; GFX10-NEXT:  ; %bb.2: ; %Flow
; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT:  ; %bb.3: ; %A
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
; GFX10-NEXT:    s_or_b32 s0, s0, s2
; GFX10-NEXT:  ; %bb.4: ; %exit
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: non_cst_non_compare_input:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_mov_b32 s1, exec_lo
; GFX11-NEXT:    ; implicit-def: $sgpr0
; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
; GFX11-NEXT:    s_xor_b32 s1, exec_lo, s1
; GFX11-NEXT:  ; %bb.1: ; %B
; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
; GFX11-NEXT:    ; implicit-def: $vgpr2
; GFX11-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
; GFX11-NEXT:  ; %bb.2: ; %Flow
; GFX11-NEXT:    s_and_not1_saveexec_b32 s1, s1
; GFX11-NEXT:  ; %bb.3: ; %A
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
; GFX11-NEXT:    s_or_b32 s0, s0, s2
; GFX11-NEXT:  ; %bb.4: ; %exit
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
entry:
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %A, label %B

A:
  %val_A = icmp uge i32 %tid, 1
  br label %exit

B:
  %val_B = icmp ult i32 %tid, 2
  br label %exit

exit:
  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
  store i32 %ballot, ptr addrspace(1) %out
  ret void
}