; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s

declare i64 @llvm.amdgcn.ballot.i64(i1)
declare i64 @llvm.ctpop.i64(i64)
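; ballot.i64 returns one bit per lane of the wave: bit N is set iff lane N is
; active and its i1 argument is true. gfx900 is a wave64 target, so the result
; occupies a full 64-bit SGPR pair.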

; Test ballot(0)
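; ballot(false) has no lanes to report and folds to the constant 0.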

define amdgpu_cs i64 @constant_false() {
; CHECK-LABEL: constant_false:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_mov_b32 s0, 0
; CHECK-NEXT:    s_mov_b32 s1, 0
; CHECK-NEXT:    ; return to shader part epilog
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0)
  ret i64 %ballot
}

; Test ballot(1)
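; ballot(true) reports every active lane, i.e. a copy of exec; no compare is
; emitted.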

define amdgpu_cs i64 @constant_true() {
; CHECK-LABEL: constant_true:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_mov_b32 s0, exec_lo
; CHECK-NEXT:    s_mov_b32 s1, exec_hi
; CHECK-NEXT:    ; return to shader part epilog
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1)
  ret i64 %ballot
}

; Test ballot of a non-comparison operation
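; A generic i1 source is normalized first: selection emits (x & 1) != 0 so the
; mask is produced by a VALU compare writing the 64-bit SGPR destination.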

define amdgpu_cs i64 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %trunc = trunc i32 %x to i1
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
  ret i64 %ballot
}

; Test ballot of comparisons
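; A compare feeding the ballot can write its mask directly to an SGPR pair via
; the VOP3 (e64) encoding, so the ballot itself needs no extra instructions.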

define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], v0, v1
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = icmp eq i32 %x, %y
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  ret i64 %ballot
}

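; 'sge 99' is canonicalized to 'sgt 98', which is why the checks compare
; against the 0x62 literal.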
define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_movk_i32 s0, 0x62
; CHECK-NEXT:    v_cmp_lt_i32_e64 s[0:1], s0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = icmp sge i32 %x, 99
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  ret i64 %ballot
}

define amdgpu_cs i64 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = fcmp ogt float %x, %y
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  ret i64 %ballot
}

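; ctpop of a ballot counts how many active lanes satisfy the compare; it
; selects to s_bcnt1_i32_b64 on the compare mask, with the high 32 bits of the
; i64 result known to be zero.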
define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
; CHECK-NEXT:    s_bcnt1_i32_b64 s0, vcc
; CHECK-NEXT:    s_mov_b32 s1, 0
; CHECK-NEXT:    ; return to shader part epilog
  %cmp = fcmp ogt float %x, %y
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
  ret i64 %bcnt
}

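; The tests below branch on the ballot compared against zero, i.e. wave-wide
; any/none tests over divergent and uniform conditions.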
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB7_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB7_3
; CHECK-NEXT:  .LBB7_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB7_3
; CHECK-NEXT:  .LBB7_3:
  %c = trunc i32 %v to i1
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_and_b32 s0, s0, 1
; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc, s0, 0
; CHECK-NEXT:    s_cbranch_vccz .LBB8_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB8_3
; CHECK-NEXT:  .LBB8_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB8_3
; CHECK-NEXT:  .LBB8_3:
  %c = trunc i32 %v to i1
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB9_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB9_3
; CHECK-NEXT:  .LBB9_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB9_3
; CHECK-NEXT:  .LBB9_3:
  %c = trunc i32 %v to i1
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_and_b32 s0, s0, 1
; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc, s0, 0
; CHECK-NEXT:    s_cbranch_vccz .LBB10_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB10_3
; CHECK-NEXT:  .LBB10_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB10_3
; CHECK-NEXT:  .LBB10_3:
  %c = trunc i32 %v to i1
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB11_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB11_3
; CHECK-NEXT:  .LBB11_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB11_3
; CHECK-NEXT:  .LBB11_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc, s0, 12
; CHECK-NEXT:    s_cbranch_vccz .LBB12_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB12_3
; CHECK-NEXT:  .LBB12_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB12_3
; CHECK-NEXT:  .LBB12_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT:    s_cbranch_vccz .LBB13_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB13_3
; CHECK-NEXT:  .LBB13_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB13_3
; CHECK-NEXT:  .LBB13_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc, s0, 12
; CHECK-NEXT:    s_cbranch_vccz .LBB14_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB14_3
; CHECK-NEXT:  .LBB14_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB14_3
; CHECK-NEXT:  .LBB14_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

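; For an AND of two compares, the divergent case combines the two VALU compare
; masks into vcc, while the uniform case combines the scalar results and ANDs
; them with exec, branching on scc.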
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT:    s_and_b64 vcc, vcc, s[0:1]
; CHECK-NEXT:    s_cbranch_vccz .LBB15_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB15_3
; CHECK-NEXT:  .LBB15_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB15_3
; CHECK-NEXT:  .LBB15_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT:    s_cbranch_scc0 .LBB16_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB16_3
; CHECK-NEXT:  .LBB16_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB16_3
; CHECK-NEXT:  .LBB16_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT:    s_and_b64 vcc, vcc, s[0:1]
; CHECK-NEXT:    s_cbranch_vccz .LBB17_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB17_3
; CHECK-NEXT:  .LBB17_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB17_3
; CHECK-NEXT:  .LBB17_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB18_3
; CHECK-NEXT:  .LBB18_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB18_3
; CHECK-NEXT:  .LBB18_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

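; A compare of the ballot against a nonzero threshold is not folded to an
; any/none test; the 64-bit mask is materialized and compared as an ordinary
; integer.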
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, 12
; CHECK-NEXT:    v_cmp_lt_i64_e64 vcc, s[0:1], 23
; CHECK-NEXT:    s_cbranch_vccnz .LBB19_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB19_3
; CHECK-NEXT:  .LBB19_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB19_3
; CHECK-NEXT:  .LBB19_3:
  %c = icmp ult i32 %v, 12
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %bc = icmp sgt i64 %ballot, 22
  br i1 %bc, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

declare i64 @llvm.amdgcn.icmp.i64(i1, i1, i32)
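; llvm.amdgcn.icmp with an i1 operand compared eq against false sets a bit for
; every active lane where the condition is false, i.e. an inverted ballot; the
; tests below branch on that inverted mask.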

define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT:    s_and_b64 vcc, vcc, s[0:1]
; CHECK-NEXT:    s_cbranch_vccnz .LBB20_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB20_3
; CHECK-NEXT:  .LBB20_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB20_3
; CHECK-NEXT:  .LBB20_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT:    s_cbranch_scc1 .LBB21_2
; CHECK-NEXT:  ; %bb.1: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB21_3
; CHECK-NEXT:  .LBB21_2: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB21_3
; CHECK-NEXT:  .LBB21_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_ne_zero = icmp ne i64 %ballot, 0
  br i1 %ballot_ne_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT:    s_and_b64 vcc, vcc, s[0:1]
; CHECK-NEXT:    s_cbranch_vccnz .LBB22_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB22_3
; CHECK-NEXT:  .LBB22_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB22_3
; CHECK-NEXT:  .LBB22_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT:    s_cbranch_scc1 .LBB23_2
; CHECK-NEXT:  ; %bb.1: ; %false
; CHECK-NEXT:    s_mov_b32 s0, 33
; CHECK-NEXT:    s_branch .LBB23_3
; CHECK-NEXT:  .LBB23_2: ; %true
; CHECK-NEXT:    s_mov_b32 s0, 42
; CHECK-NEXT:    s_branch .LBB23_3
; CHECK-NEXT:  .LBB23_3:
  %v1c = icmp ult i32 %v1, 12
  %v2c = icmp ugt i32 %v2, 34
  %c = and i1 %v1c, %v2c
  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
  %ballot_eq_zero = icmp eq i64 %ballot, 0
  br i1 %ballot_eq_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}

; Input that is not a constant or the direct result of a compare.
; Tests that inactive lanes contribute 0 to the ballot.
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; CHECK-LABEL: non_cst_non_compare_input:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT:    ; implicit-def: $sgpr0_sgpr1
; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; CHECK-NEXT:  ; %bb.1: ; %B
; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT:    ; implicit-def: $vgpr2
; CHECK-NEXT:  ; %bb.2: ; %Flow
; CHECK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[2:3]
; CHECK-NEXT:  ; %bb.3: ; %A
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; CHECK-NEXT:    s_and_b64 s[4:5], vcc, exec
; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
; CHECK-NEXT:  ; %bb.4: ; %exit
; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; CHECK-NEXT:    s_endpgm
entry:
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %A, label %B

A:
  %val_A = icmp uge i32 %tid, 1
  br label %exit

B:
  %val_B = icmp ult i32 %tid, 2
  br label %exit

exit:
  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
  store i64 %ballot, ptr addrspace(1) %out
  ret void
}