xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll (revision 3277c7cd28154e33637a168acb26cea7ac1f7fff)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
4
; Intrinsics exercised by this file: the 32-bit ballot, and ctpop, which
; ctpop_of_ballot applies to a ballot result.
5declare i32 @llvm.amdgcn.ballot.i32(i1)
6declare i32 @llvm.ctpop.i32(i32)
7
8; Test ballot(0)
9
; Ballot of the constant i1 0. The expected output is a single move of the
; literal 0 into s0, with no compare emitted.
10define amdgpu_cs i32 @constant_false() {
11; CHECK-LABEL: constant_false:
12; CHECK:       ; %bb.0:
13; CHECK-NEXT:    s_mov_b32 s0, 0
14; CHECK-NEXT:    ; return to shader part epilog
15  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0)
16  ret i32 %ballot
17}
18
19; Test ballot(1)
20
; Ballot of the constant i1 1. The expected output copies exec_lo into s0
; rather than materializing a compare.
21define amdgpu_cs i32 @constant_true() {
22; CHECK-LABEL: constant_true:
23; CHECK:       ; %bb.0:
24; CHECK-NEXT:    s_mov_b32 s0, exec_lo
25; CHECK-NEXT:    ; return to shader part epilog
26  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1)
27  ret i32 %ballot
28}
29
30; Test ballot of a non-comparison operation
31
; The ballot input is a trunc of %x to i1, not a compare. The expected output
; masks bit 0 of v0 and then compares the result against 0 (ne), writing the
; lane mask to s0.
32define amdgpu_cs i32 @non_compare(i32 %x) {
33; CHECK-LABEL: non_compare:
34; CHECK:       ; %bb.0:
35; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
36; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
37; CHECK-NEXT:    ; return to shader part epilog
38  %trunc = trunc i32 %x to i1
39  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
40  ret i32 %ballot
41}
42
43; Test ballot of comparisons
44
; Ballot of an integer equality compare. The expected output is one
; v_cmp_eq_u32 whose destination is s0 directly, with no extra instruction
; for the ballot itself.
45define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
46; CHECK-LABEL: compare_ints:
47; CHECK:       ; %bb.0:
48; CHECK-NEXT:    v_cmp_eq_u32_e64 s0, v0, v1
49; CHECK-NEXT:    ; return to shader part epilog
50  %cmp = icmp eq i32 %x, %y
51  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
52  ret i32 %ballot
53}
54
; Ballot of a signed compare against a constant. The IR tests %x >= 99; the
; expected output expresses the same condition as 0x62 (98) < v0.
55define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
56; CHECK-LABEL: compare_int_with_constant:
57; CHECK:       ; %bb.0:
58; CHECK-NEXT:    v_cmp_lt_i32_e64 s0, 0x62, v0
59; CHECK-NEXT:    ; return to shader part epilog
60  %cmp = icmp sge i32 %x, 99
61  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
62  ret i32 %ballot
63}
64
; Ballot of a floating-point ordered greater-than compare. The expected
; output is one v_cmp_gt_f32 writing the lane mask straight into s0.
65define amdgpu_cs i32 @compare_floats(float %x, float %y) {
66; CHECK-LABEL: compare_floats:
67; CHECK:       ; %bb.0:
68; CHECK-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
69; CHECK-NEXT:    ; return to shader part epilog
70  %cmp = fcmp ogt float %x, %y
71  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
72  ret i32 %ballot
73}
74
; Population count of a ballot result. The expected output compares into
; vcc_lo and feeds that mask to s_bcnt1_i32_b32, which produces the returned
; value in s0.
75define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
76; CHECK-LABEL: ctpop_of_ballot:
77; CHECK:       ; %bb.0:
78; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
79; CHECK-NEXT:    s_bcnt1_i32_b32 s0, vcc_lo
80; CHECK-NEXT:    ; return to shader part epilog
81  %cmp = fcmp ogt float %x, %y
82  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
83  %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
84  ret i32 %bcnt
85}
86
; Branch on (ballot != 0) where the ballot input is a trunc of a non-inreg
; argument (read from v0 in the expected output). The mask is computed into
; vcc_lo and s_cbranch_vccz jumps to %false when it is zero; %true is the
; fall-through.
87define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
88; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
89; CHECK:       ; %bb.0:
90; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
91; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
92; CHECK-NEXT:    s_cbranch_vccz .LBB7_2
93; CHECK-NEXT:  ; %bb.1: ; %true
94; CHECK-NEXT:    s_mov_b32 s0, 42
95; CHECK-NEXT:    s_branch .LBB7_3
96; CHECK-NEXT:  .LBB7_2: ; %false
97; CHECK-NEXT:    s_mov_b32 s0, 33
98; CHECK-NEXT:    s_branch .LBB7_3
99; CHECK-NEXT:  .LBB7_3:
100  %c = trunc i32 %v to i1
101  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
102  %ballot_ne_zero = icmp ne i32 %ballot, 0
103  br i1 %ballot_ne_zero, label %true, label %false
104true:
105  ret i32 42
106false:
107  ret i32 33
108}
109
; Same IR as the divergent ne_zero_non_compare test, but %v is inreg and is
; read from s0 in the expected output. Bit 0 is masked with a scalar AND, the
; compare still produces a lane mask in vcc_lo, and s_cbranch_vccz jumps to
; %false.
110define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
111; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
112; CHECK:       ; %bb.0:
113; CHECK-NEXT:    s_and_b32 s0, s0, 1
114; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc_lo, s0, 0
115; CHECK-NEXT:    s_cbranch_vccz .LBB8_2
116; CHECK-NEXT:  ; %bb.1: ; %true
117; CHECK-NEXT:    s_mov_b32 s0, 42
118; CHECK-NEXT:    s_branch .LBB8_3
119; CHECK-NEXT:  .LBB8_2: ; %false
120; CHECK-NEXT:    s_mov_b32 s0, 33
121; CHECK-NEXT:    s_branch .LBB8_3
122; CHECK-NEXT:  .LBB8_3:
123  %c = trunc i32 %v to i1
124  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
125  %ballot_ne_zero = icmp ne i32 %ballot, 0
126  br i1 %ballot_ne_zero, label %true, label %false
127true:
128  ret i32 42
129false:
130  ret i32 33
131}
132
; Branch on (ballot == 0) with a trunc as the ballot input. The expected
; output uses the same mask computation and s_cbranch_vccz as the ne_zero
; variant, but the targets are swapped: a zero mask jumps to %true and %false
; is the fall-through.
133define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
134; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
135; CHECK:       ; %bb.0:
136; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
137; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
138; CHECK-NEXT:    s_cbranch_vccz .LBB9_2
139; CHECK-NEXT:  ; %bb.1: ; %false
140; CHECK-NEXT:    s_mov_b32 s0, 33
141; CHECK-NEXT:    s_branch .LBB9_3
142; CHECK-NEXT:  .LBB9_2: ; %true
143; CHECK-NEXT:    s_mov_b32 s0, 42
144; CHECK-NEXT:    s_branch .LBB9_3
145; CHECK-NEXT:  .LBB9_3:
146  %c = trunc i32 %v to i1
147  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
148  %ballot_eq_zero = icmp eq i32 %ballot, 0
149  br i1 %ballot_eq_zero, label %true, label %false
150true:
151  ret i32 42
152false:
153  ret i32 33
154}
155
; inreg counterpart of the divergent eq_zero_non_compare test: %v is read
; from s0, masked with a scalar AND, compared into vcc_lo, and a zero mask
; jumps to %true via s_cbranch_vccz.
156define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
157; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
158; CHECK:       ; %bb.0:
159; CHECK-NEXT:    s_and_b32 s0, s0, 1
160; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc_lo, s0, 0
161; CHECK-NEXT:    s_cbranch_vccz .LBB10_2
162; CHECK-NEXT:  ; %bb.1: ; %false
163; CHECK-NEXT:    s_mov_b32 s0, 33
164; CHECK-NEXT:    s_branch .LBB10_3
165; CHECK-NEXT:  .LBB10_2: ; %true
166; CHECK-NEXT:    s_mov_b32 s0, 42
167; CHECK-NEXT:    s_branch .LBB10_3
168; CHECK-NEXT:  .LBB10_3:
169  %c = trunc i32 %v to i1
170  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
171  %ballot_eq_zero = icmp eq i32 %ballot, 0
172  br i1 %ballot_eq_zero, label %true, label %false
173true:
174  ret i32 42
175false:
176  ret i32 33
177}
178
; Branch on (ballot != 0) where the ballot input is an unsigned compare
; (%v < 12). The expected output writes the compare (as 12 > v0) to vcc_lo
; and branches on it directly with s_cbranch_vccz to %false.
179define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
180; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
181; CHECK:       ; %bb.0:
182; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
183; CHECK-NEXT:    s_cbranch_vccz .LBB11_2
184; CHECK-NEXT:  ; %bb.1: ; %true
185; CHECK-NEXT:    s_mov_b32 s0, 42
186; CHECK-NEXT:    s_branch .LBB11_3
187; CHECK-NEXT:  .LBB11_2: ; %false
188; CHECK-NEXT:    s_mov_b32 s0, 33
189; CHECK-NEXT:    s_branch .LBB11_3
190; CHECK-NEXT:  .LBB11_3:
191  %c = icmp ult i32 %v, 12
192  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
193  %ballot_ne_zero = icmp ne i32 %ballot, 0
194  br i1 %ballot_ne_zero, label %true, label %false
195true:
196  ret i32 42
197false:
198  ret i32 33
199}
200
; inreg counterpart of the divergent ne_zero_compare test. The expected
; output still uses a vector compare (s0 < 12) into vcc_lo followed by
; s_cbranch_vccz to %false.
201define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
202; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
203; CHECK:       ; %bb.0:
204; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc_lo, s0, 12
205; CHECK-NEXT:    s_cbranch_vccz .LBB12_2
206; CHECK-NEXT:  ; %bb.1: ; %true
207; CHECK-NEXT:    s_mov_b32 s0, 42
208; CHECK-NEXT:    s_branch .LBB12_3
209; CHECK-NEXT:  .LBB12_2: ; %false
210; CHECK-NEXT:    s_mov_b32 s0, 33
211; CHECK-NEXT:    s_branch .LBB12_3
212; CHECK-NEXT:  .LBB12_3:
213  %c = icmp ult i32 %v, 12
214  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
215  %ballot_ne_zero = icmp ne i32 %ballot, 0
216  br i1 %ballot_ne_zero, label %true, label %false
217true:
218  ret i32 42
219false:
220  ret i32 33
221}
222
; Branch on (ballot == 0) with an unsigned compare (%v < 12) as the ballot
; input. Same compare and s_cbranch_vccz as the ne_zero variant, with the
; targets swapped: a zero mask jumps to %true.
223define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
224; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
225; CHECK:       ; %bb.0:
226; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
227; CHECK-NEXT:    s_cbranch_vccz .LBB13_2
228; CHECK-NEXT:  ; %bb.1: ; %false
229; CHECK-NEXT:    s_mov_b32 s0, 33
230; CHECK-NEXT:    s_branch .LBB13_3
231; CHECK-NEXT:  .LBB13_2: ; %true
232; CHECK-NEXT:    s_mov_b32 s0, 42
233; CHECK-NEXT:    s_branch .LBB13_3
234; CHECK-NEXT:  .LBB13_3:
235  %c = icmp ult i32 %v, 12
236  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
237  %ballot_eq_zero = icmp eq i32 %ballot, 0
238  br i1 %ballot_eq_zero, label %true, label %false
239true:
240  ret i32 42
241false:
242  ret i32 33
243}
244
; inreg counterpart of the divergent eq_zero_compare test: vector compare of
; s0 against 12 into vcc_lo, then s_cbranch_vccz jumps to %true on a zero
; mask.
245define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
246; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
247; CHECK:       ; %bb.0:
248; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc_lo, s0, 12
249; CHECK-NEXT:    s_cbranch_vccz .LBB14_2
250; CHECK-NEXT:  ; %bb.1: ; %false
251; CHECK-NEXT:    s_mov_b32 s0, 33
252; CHECK-NEXT:    s_branch .LBB14_3
253; CHECK-NEXT:  .LBB14_2: ; %true
254; CHECK-NEXT:    s_mov_b32 s0, 42
255; CHECK-NEXT:    s_branch .LBB14_3
256; CHECK-NEXT:  .LBB14_3:
257  %c = icmp ult i32 %v, 12
258  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
259  %ballot_eq_zero = icmp eq i32 %ballot, 0
260  br i1 %ballot_eq_zero, label %true, label %false
261true:
262  ret i32 42
263false:
264  ret i32 33
265}
266
; Branch on (ballot != 0) where the ballot input is the AND of two compares
; (%v1 < 12 and %v2 > 34). The expected output computes both lane masks,
; combines them with s_and_b32 into vcc_lo, and jumps to %false when the
; combined mask is zero.
267define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
268; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
269; CHECK:       ; %bb.0:
270; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
271; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
272; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
273; CHECK-NEXT:    s_cbranch_vccz .LBB15_2
274; CHECK-NEXT:  ; %bb.1: ; %true
275; CHECK-NEXT:    s_mov_b32 s0, 42
276; CHECK-NEXT:    s_branch .LBB15_3
277; CHECK-NEXT:  .LBB15_2: ; %false
278; CHECK-NEXT:    s_mov_b32 s0, 33
279; CHECK-NEXT:    s_branch .LBB15_3
280; CHECK-NEXT:  .LBB15_3:
281  %v1c = icmp ult i32 %v1, 12
282  %v2c = icmp ugt i32 %v2, 34
283  %c = and i1 %v1c, %v2c
284  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
285  %ballot_ne_zero = icmp ne i32 %ballot, 0
286  br i1 %ballot_ne_zero, label %true, label %false
287true:
288  ret i32 42
289false:
290  ret i32 33
291}
292
; inreg counterpart of the divergent ne_zero_and test. Both compares are
; scalar here (s_cmp + s_cselect to -1/0); the results are ANDed together and
; then with exec_lo, and the branch to %false is taken through
; s_cbranch_scc0 instead of a vcc branch.
293define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
294; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
295; CHECK:       ; %bb.0:
296; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
297; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
298; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
299; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
300; CHECK-NEXT:    s_and_b32 s0, s0, s1
301; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
302; CHECK-NEXT:    s_cbranch_scc0 .LBB16_2
303; CHECK-NEXT:  ; %bb.1: ; %true
304; CHECK-NEXT:    s_mov_b32 s0, 42
305; CHECK-NEXT:    s_branch .LBB16_3
306; CHECK-NEXT:  .LBB16_2: ; %false
307; CHECK-NEXT:    s_mov_b32 s0, 33
308; CHECK-NEXT:    s_branch .LBB16_3
309; CHECK-NEXT:  .LBB16_3:
310  %v1c = icmp ult i32 %v1, 12
311  %v2c = icmp ugt i32 %v2, 34
312  %c = and i1 %v1c, %v2c
313  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
314  %ballot_ne_zero = icmp ne i32 %ballot, 0
315  br i1 %ballot_ne_zero, label %true, label %false
316true:
317  ret i32 42
318false:
319  ret i32 33
320}
321
; Branch on (ballot == 0) with the AND of two compares as the ballot input.
; Same mask computation and s_cbranch_vccz as the ne_zero_and variant, with
; the targets swapped: a zero mask jumps to %true.
322define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
323; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
324; CHECK:       ; %bb.0:
325; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
326; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
327; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
328; CHECK-NEXT:    s_cbranch_vccz .LBB17_2
329; CHECK-NEXT:  ; %bb.1: ; %false
330; CHECK-NEXT:    s_mov_b32 s0, 33
331; CHECK-NEXT:    s_branch .LBB17_3
332; CHECK-NEXT:  .LBB17_2: ; %true
333; CHECK-NEXT:    s_mov_b32 s0, 42
334; CHECK-NEXT:    s_branch .LBB17_3
335; CHECK-NEXT:  .LBB17_3:
336  %v1c = icmp ult i32 %v1, 12
337  %v2c = icmp ugt i32 %v2, 34
338  %c = and i1 %v1c, %v2c
339  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
340  %ballot_eq_zero = icmp eq i32 %ballot, 0
341  br i1 %ballot_eq_zero, label %true, label %false
342true:
343  ret i32 42
344false:
345  ret i32 33
346}
347
; inreg counterpart of the divergent eq_zero_and test: scalar compares and
; selects, AND with exec_lo, then s_cbranch_scc0 jumps to %true (the targets
; are swapped relative to the ne_zero_and uniform test).
348define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
349; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
350; CHECK:       ; %bb.0:
351; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
352; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
353; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
354; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
355; CHECK-NEXT:    s_and_b32 s0, s0, s1
356; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
357; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
358; CHECK-NEXT:  ; %bb.1: ; %false
359; CHECK-NEXT:    s_mov_b32 s0, 33
360; CHECK-NEXT:    s_branch .LBB18_3
361; CHECK-NEXT:  .LBB18_2: ; %true
362; CHECK-NEXT:    s_mov_b32 s0, 42
363; CHECK-NEXT:    s_branch .LBB18_3
364; CHECK-NEXT:  .LBB18_3:
365  %v1c = icmp ult i32 %v1, 12
366  %v2c = icmp ugt i32 %v2, 34
367  %c = and i1 %v1c, %v2c
368  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
369  %ballot_eq_zero = icmp eq i32 %ballot, 0
370  br i1 %ballot_eq_zero, label %true, label %false
371true:
372  ret i32 42
373false:
374  ret i32 33
375}
376
; The ballot result is compared against a non-zero constant (sgt 22) rather
; than tested for zero. The expected output therefore materializes the mask
; in s0 and compares it as a scalar (s0 < 23), jumping to %false through
; s_cbranch_scc1 when that holds.
377define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
378; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
379; CHECK:       ; %bb.0:
380; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, s0, 12
381; CHECK-NEXT:    s_cmp_lt_i32 s0, 23
382; CHECK-NEXT:    s_cbranch_scc1 .LBB19_2
383; CHECK-NEXT:  ; %bb.1: ; %true
384; CHECK-NEXT:    s_mov_b32 s0, 42
385; CHECK-NEXT:    s_branch .LBB19_3
386; CHECK-NEXT:  .LBB19_2: ; %false
387; CHECK-NEXT:    s_mov_b32 s0, 33
388; CHECK-NEXT:    s_branch .LBB19_3
389; CHECK-NEXT:  .LBB19_3:
390  %c = icmp ult i32 %v, 12
391  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
392  %bc = icmp sgt i32 %ballot, 22
393  br i1 %bc, label %true, label %false
394true:
395  ret i32 42
396false:
397  ret i32 33
398}
399
; Lane-mask compare intrinsic; the "simulated_negated" tests that follow call
; it with an i1 value, the constant i1 0 and predicate 32 in place of the
; ballot intrinsic.
400declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)
401
; "Negated ballot" expressed through llvm.amdgcn.icmp: %c is compared for
; equality with the constant i1 0 (predicate 32 is ICMP_EQ per the inline
; note), and the result is tested against zero (ne). The expected output
; builds the same vcc_lo mask as the plain ne_zero_and test but branches to
; %false with s_cbranch_vccnz instead of s_cbranch_vccz.
; NOTE(review): when lanes disagree on %c, "some lane has %c false" (the IR
; condition) and "no lane has %c true" (the fall-through condition of the
; expected branch) are not the same predicate -- confirm this inversion is
; intended for non-inreg inputs.
402define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
403; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
404; CHECK:       ; %bb.0:
405; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
406; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
407; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
408; CHECK-NEXT:    s_cbranch_vccnz .LBB20_2
409; CHECK-NEXT:  ; %bb.1: ; %true
410; CHECK-NEXT:    s_mov_b32 s0, 42
411; CHECK-NEXT:    s_branch .LBB20_3
412; CHECK-NEXT:  .LBB20_2: ; %false
413; CHECK-NEXT:    s_mov_b32 s0, 33
414; CHECK-NEXT:    s_branch .LBB20_3
415; CHECK-NEXT:  .LBB20_3:
416  %v1c = icmp ult i32 %v1, 12
417  %v2c = icmp ugt i32 %v2, 34
418  %c = and i1 %v1c, %v2c
419  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
420  %ballot_ne_zero = icmp ne i32 %ballot, 0
421  br i1 %ballot_ne_zero, label %true, label %false
422true:
423  ret i32 42
424false:
425  ret i32 33
426}
427
; inreg counterpart of the divergent simulated-negated ne_zero_and test. The
; expected output is the scalar sequence of the plain uniform ne_zero_and
; test, except that the branch to %false uses s_cbranch_scc1 instead of
; s_cbranch_scc0. The TODO inside the function records a shorter sequence the
; backend could emit.
428define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
429; TODO:
430;   s_cmp_lt_u32 s0, 12
431;   s_cselect_b32 s0, -1, 0
432;   s_cmp_gt_u32 s1, 34
433;   s_cselect_b32 s1, -1, 0
434;   s_and_b32 s0, s0, s1
435;   s_and_b32 s0, s0, exec_lo
436; could be improved to:
437;   s_cmp_lt_u32 s0, 12
438;   s_cselect_b32 s0, -1, 0
439;   s_cmp_gt_u32 s1, 34
440;   s_cselect_b32 s0, s0, 0
441;   s_and_b32 s0, s0, exec_lo
442; By selecting into vcc(_lo) instead, we could even avoid the AND-with-exec.
443; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
444; CHECK:       ; %bb.0:
445; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
446; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
447; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
448; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
449; CHECK-NEXT:    s_and_b32 s0, s0, s1
450; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
451; CHECK-NEXT:    s_cbranch_scc1 .LBB21_2
452; CHECK-NEXT:  ; %bb.1: ; %true
453; CHECK-NEXT:    s_mov_b32 s0, 42
454; CHECK-NEXT:    s_branch .LBB21_3
455; CHECK-NEXT:  .LBB21_2: ; %false
456; CHECK-NEXT:    s_mov_b32 s0, 33
457; CHECK-NEXT:    s_branch .LBB21_3
458; CHECK-NEXT:  .LBB21_3:
459  %v1c = icmp ult i32 %v1, 12
460  %v2c = icmp ugt i32 %v2, 34
461  %c = and i1 %v1c, %v2c
462  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
463  %ballot_ne_zero = icmp ne i32 %ballot, 0
464  br i1 %ballot_ne_zero, label %true, label %false
465true:
466  ret i32 42
467false:
468  ret i32 33
469}
470
; eq-zero form of the simulated negated ballot: llvm.amdgcn.icmp compares %c
; for equality with the constant i1 0, and the branch tests that result
; against zero (eq). The expected output builds the vcc_lo mask of %c and
; jumps to %true with s_cbranch_vccnz; %false is the fall-through.
; NOTE(review): when lanes disagree on %c, "no lane has %c false" (the IR
; condition) and "some lane has %c true" (the expected branch condition) are
; not the same predicate -- confirm this inversion is intended for non-inreg
; inputs.
471define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
472; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
473; CHECK:       ; %bb.0:
474; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
475; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
476; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
477; CHECK-NEXT:    s_cbranch_vccnz .LBB22_2
478; CHECK-NEXT:  ; %bb.1: ; %false
479; CHECK-NEXT:    s_mov_b32 s0, 33
480; CHECK-NEXT:    s_branch .LBB22_3
481; CHECK-NEXT:  .LBB22_2: ; %true
482; CHECK-NEXT:    s_mov_b32 s0, 42
483; CHECK-NEXT:    s_branch .LBB22_3
484; CHECK-NEXT:  .LBB22_3:
485  %v1c = icmp ult i32 %v1, 12
486  %v2c = icmp ugt i32 %v2, 34
487  %c = and i1 %v1c, %v2c
488  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
489  %ballot_eq_zero = icmp eq i32 %ballot, 0
490  br i1 %ballot_eq_zero, label %true, label %false
491true:
492  ret i32 42
493false:
494  ret i32 33
495}
496
; inreg counterpart of the divergent simulated-negated eq_zero_and test:
; scalar compares and selects, AND with exec_lo, then s_cbranch_scc1 jumps to
; %true; %false is the fall-through.
497define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
498; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
499; CHECK:       ; %bb.0:
500; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
501; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
502; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
503; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
504; CHECK-NEXT:    s_and_b32 s0, s0, s1
505; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
506; CHECK-NEXT:    s_cbranch_scc1 .LBB23_2
507; CHECK-NEXT:  ; %bb.1: ; %false
508; CHECK-NEXT:    s_mov_b32 s0, 33
509; CHECK-NEXT:    s_branch .LBB23_3
510; CHECK-NEXT:  .LBB23_2: ; %true
511; CHECK-NEXT:    s_mov_b32 s0, 42
512; CHECK-NEXT:    s_branch .LBB23_3
513; CHECK-NEXT:  .LBB23_3:
514  %v1c = icmp ult i32 %v1, 12
515  %v2c = icmp ugt i32 %v2, 34
516  %c = and i1 %v1c, %v2c
517  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
518  %ballot_eq_zero = icmp eq i32 %ballot, 0
519  br i1 %ballot_eq_zero, label %true, label %false
520true:
521  ret i32 42
522false:
523  ret i32 33
524}
525
526; The ballot input is neither a constant nor the direct result of a compare.
527; Tests that the bits for inactive lanes are set to 0.
; The i1 passed to the ballot is a phi that merges a compare from block A
; with a compare from block B, selected by %cond. This is the only test in
; the file with separate expectations per target (gfx1010 and gfx1100). In
; both, the two paths run under exec masking, the merged condition ends up in
; s0, and after exec is restored it is turned into a 0/1 value per lane
; (v_cndmask) and re-compared against 0 to form the stored mask.
528define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
529; GFX10-LABEL: non_cst_non_compare_input:
530; GFX10:       ; %bb.0: ; %entry
531; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
532; GFX10-NEXT:    ; implicit-def: $sgpr0
533; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
534; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
535; GFX10-NEXT:  ; %bb.1: ; %B
536; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
537; GFX10-NEXT:    ; implicit-def: $vgpr2
538; GFX10-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
539; GFX10-NEXT:  ; %bb.2: ; %Flow
540; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
541; GFX10-NEXT:  ; %bb.3: ; %A
542; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
543; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
544; GFX10-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
545; GFX10-NEXT:    s_or_b32 s0, s0, s2
546; GFX10-NEXT:  ; %bb.4: ; %exit
547; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
548; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
549; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
550; GFX10-NEXT:    v_mov_b32_e32 v2, s0
551; GFX10-NEXT:    global_store_dword v[0:1], v2, off
552; GFX10-NEXT:    s_endpgm
553;
554; GFX11-LABEL: non_cst_non_compare_input:
555; GFX11:       ; %bb.0: ; %entry
556; GFX11-NEXT:    s_mov_b32 s1, exec_lo
557; GFX11-NEXT:    ; implicit-def: $sgpr0
558; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
559; GFX11-NEXT:    s_xor_b32 s1, exec_lo, s1
560; GFX11-NEXT:  ; %bb.1: ; %B
561; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
562; GFX11-NEXT:    ; implicit-def: $vgpr2
563; GFX11-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
564; GFX11-NEXT:  ; %bb.2: ; %Flow
565; GFX11-NEXT:    s_and_not1_saveexec_b32 s1, s1
566; GFX11-NEXT:  ; %bb.3: ; %A
567; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
568; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
569; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
570; GFX11-NEXT:    s_or_b32 s0, s0, s2
571; GFX11-NEXT:  ; %bb.4: ; %exit
572; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
573; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
574; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
575; GFX11-NEXT:    v_mov_b32_e32 v2, s0
576; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
577; GFX11-NEXT:    s_endpgm
578entry:
579  %cmp = icmp eq i32 %cond, 0
580  br i1 %cmp, label %A, label %B
581

582A:
583  %val_A = icmp uge i32 %tid, 1
584  br label %exit
585

586B:
587  %val_B = icmp ult i32 %tid, 2
588  br label %exit
589

590exit:
; The ballot operand is the phi itself, so no compare feeds it directly.
591  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
592  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
593  store i32 %ballot, ptr addrspace(1) %out
594  ret void
595}
596