xref: /llvm-project/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll (revision 40fa7f5e8b315159d45aa280c771af5998bdc75e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s
4
; uniform_kill: test function using the amdgpu_ps calling convention. It takes
; (float %a, i32 %b, float %c) and returns a float. The SI-* and FLAT-* lines
; inside the body are the expected llc output for the two RUN configurations at
; the top of the file; per the NOTE there they are autogenerated, so regenerate
; them with the script instead of editing them by hand.
5define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
6; SI-LABEL: uniform_kill:
7; SI:       ; %bb.0: ; %entry
8; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
9; SI-NEXT:    s_mov_b64 s[0:1], exec
10; SI-NEXT:    s_mov_b64 s[2:3], -1
11; SI-NEXT:    v_or_b32_e32 v0, v1, v0
12; SI-NEXT:    v_and_b32_e32 v0, 1, v0
13; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
14; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
15; SI-NEXT:  ; %bb.1: ; %if1
16; SI-NEXT:    s_xor_b64 s[2:3], exec, -1
17; SI-NEXT:  ; %bb.2: ; %endif1
18; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
19; SI-NEXT:    s_wqm_b64 s[4:5], s[2:3]
20; SI-NEXT:    s_andn2_b64 s[4:5], exec, s[4:5]
21; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
22; SI-NEXT:    s_cbranch_scc0 .LBB0_6
23; SI-NEXT:  ; %bb.3: ; %endif1
24; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
25; SI-NEXT:    v_mov_b32_e32 v0, 0
26; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
27; SI-NEXT:    s_cbranch_execz .LBB0_5
28; SI-NEXT:  ; %bb.4: ; %if2
29; SI-NEXT:    s_mov_b32 s3, 0
30; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
31; SI-NEXT:    v_add_f32_e32 v0, 1.0, v2
32; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
33; SI-NEXT:    s_waitcnt lgkmcnt(0)
34; SI-NEXT:    s_mov_b32 s6, s4
35; SI-NEXT:    s_mov_b32 s7, s5
36; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc
37; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
38; SI-NEXT:    v_cvt_f32_i32_e32 v0, v0
39; SI-NEXT:  .LBB0_5: ; %endif2
40; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
41; SI-NEXT:    s_branch .LBB0_7
42; SI-NEXT:  .LBB0_6:
43; SI-NEXT:    s_mov_b64 exec, 0
44; SI-NEXT:    exp null off, off, off, off done vm
45; SI-NEXT:    s_endpgm
46; SI-NEXT:  .LBB0_7:
47;
48; FLAT-LABEL: uniform_kill:
49; FLAT:       ; %bb.0: ; %entry
50; FLAT-NEXT:    v_cvt_i32_f32_e32 v0, v0
51; FLAT-NEXT:    s_mov_b64 s[0:1], exec
52; FLAT-NEXT:    s_mov_b64 s[2:3], -1
53; FLAT-NEXT:    v_or_b32_e32 v0, v1, v0
54; FLAT-NEXT:    v_and_b32_e32 v0, 1, v0
55; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
56; FLAT-NEXT:    s_and_saveexec_b64 s[4:5], vcc
57; FLAT-NEXT:  ; %bb.1: ; %if1
58; FLAT-NEXT:    s_xor_b64 s[2:3], exec, -1
59; FLAT-NEXT:  ; %bb.2: ; %endif1
60; FLAT-NEXT:    s_or_b64 exec, exec, s[4:5]
61; FLAT-NEXT:    s_wqm_b64 s[4:5], s[2:3]
62; FLAT-NEXT:    s_andn2_b64 s[4:5], exec, s[4:5]
63; FLAT-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
64; FLAT-NEXT:    s_cbranch_scc0 .LBB0_6
65; FLAT-NEXT:  ; %bb.3: ; %endif1
66; FLAT-NEXT:    s_and_b64 exec, exec, s[0:1]
67; FLAT-NEXT:    v_mov_b32_e32 v0, 0
68; FLAT-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
69; FLAT-NEXT:    s_cbranch_execz .LBB0_5
70; FLAT-NEXT:  ; %bb.4: ; %if2
71; FLAT-NEXT:    s_mov_b32 s3, 0
72; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
73; FLAT-NEXT:    v_add_f32_e32 v0, 1.0, v2
74; FLAT-NEXT:    v_cvt_i32_f32_e32 v0, v0
75; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
76; FLAT-NEXT:    s_mov_b32 s6, s4
77; FLAT-NEXT:    s_mov_b32 s7, s5
78; FLAT-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc
79; FLAT-NEXT:    s_waitcnt vmcnt(0)
80; FLAT-NEXT:    v_cvt_f32_i32_e32 v0, v0
81; FLAT-NEXT:  .LBB0_5: ; %endif2
82; FLAT-NEXT:    s_or_b64 exec, exec, s[0:1]
83; FLAT-NEXT:    s_branch .LBB0_7
84; FLAT-NEXT:  .LBB0_6:
85; FLAT-NEXT:    s_mov_b64 exec, 0
86; FLAT-NEXT:    exp null off, off, off, off done vm
87; FLAT-NEXT:    s_endpgm
88; FLAT-NEXT:  .LBB0_7:
; entry: convert %a to i32, OR it with %b and isolate bit 0. When that bit is
; clear control goes straight to %endif1; otherwise it goes through %if1.
89entry:
90  %.1 = fptosi float %a to i32
91  %.2 = or i32 %b, %.1
92  %.3 = and i32 %.2, 1
93  %.not = icmp eq i32 %.3, 0
94  br i1 %.not, label %endif1, label %if1
95
; if1: the branch condition is the constant false, so %if3 is never entered
; from here and control always continues to %endif1.
96if1:
97  br i1 false, label %if3, label %endif1
98
; if3: only reachable through the constant-false edge out of %if1.
99if3:
100  br label %endif1
101
; endif1: %.0 is true only when arriving directly from %entry (bit 0 clear) and
; false on the %if1/%if3 edges. It is the operand of llvm.amdgcn.wqm.vote, whose
; result is in turn the operand of llvm.amdgcn.kill, and it also selects %if2
; at the end of this block. %c + 1.0 is converted to i32 here for use in %if2.
102endif1:
103  %.0 = phi i1 [ false, %if3 ], [ false, %if1 ], [ true, %entry ]
104  %.4 = call i1 @llvm.amdgcn.wqm.vote(i1 %.0)
105  ; This kill must be uniformly executed
106  call void @llvm.amdgcn.kill(i1 %.4)
107  %.test0 = fadd nsz arcp float %c, 1.0
108  %.test1 = fptosi float %.test0 to i32
109  br i1 %.0, label %if2, label %endif2
110
; if2: load a ptr addrspace(8) value from element 31 of an undef addrspace(6)
; base pointer, swap %.test1 into the buffer through the raw-pointer buffer
; atomic-swap intrinsic (offset operand 4, matching `offset:4` in the checks),
; and convert the returned i32 to float.
111if2:
112  %.5 = getelementptr inbounds ptr addrspace(8), ptr addrspace(6) undef, i32 31, !amdgpu.uniform !0
113  %.6 = load ptr addrspace(8), ptr addrspace(6) %.5, align 16, !invariant.load !0
114  %.7 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32 %.test1, ptr addrspace(8) %.6, i32 4, i32 0, i32 0)
115  %.8 = sitofp i32 %.7 to float
116  br label %endif2
117
; endif2: return the converted swap result when coming from %if2, or 0.0 when
; %if2 was skipped.
118endif2:
119  %.9 = phi float [ %.8, %if2 ], [ 0.0, %endif1 ]
120  ret float %.9
121}
122
123
; Intrinsic declarations. @uniform_kill calls the buffer atomic swap, wqm.vote
; and kill intrinsics.
; NOTE(review): @llvm.amdgcn.wqm.f32, and attribute group #1 which only it
; uses, are not referenced by any function in the visible part of this file;
; confirm nothing else needs them before removing.
124declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) #2
125declare i1 @llvm.amdgcn.wqm.vote(i1) #3
126declare void @llvm.amdgcn.kill(i1) #4
127declare float @llvm.amdgcn.wqm.f32(float) #1
128
; Attribute groups referenced by the declarations above (#1..#4).
129attributes #1 = { nounwind readnone speculatable willreturn }
130attributes #2 = { nounwind willreturn memory(argmem: readwrite) }
131attributes #3 = { convergent nounwind readnone willreturn }
132attributes #4 = { nounwind }
133
; Empty metadata node; used as the operand of the !amdgpu.uniform and
; !invariant.load attachments in @uniform_kill.
134!0 = !{}
135