; RUN: llc -mtriple=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions

; waitcnt should be inserted after exec modification
; SI:      v_cmp_lt_i32_e32 vcc, 1,
; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %LeafBlock3
; SI:      s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI:      s_and_saveexec_b64
; SI-NEXT: s_cbranch_execnz

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]]
;
define amdgpu_kernel void @test_if(i32 %b, ptr addrspace(1) %src, ptr addrspace(1) %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 1, label %case1
    i32 2, label %case2
  ]

case1:
  %arrayidx1 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
  store i32 13, ptr addrspace(1) %arrayidx1, align 4
  br label %end

case2:
  %arrayidx5 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
  store i32 17, ptr addrspace(1) %arrayidx5, align 4
  br label %end

default:
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, ptr addrspace(1) %arrayidx10, align 4
  br label %end

else:
  store i32 21, ptr addrspace(1) %arrayidx10, align 4
  br label %end

end:
  ret void
}

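; A simple divergent if: the compare writes vcc with a VALU instruction,
; the branch is lowered by masking exec with s_and_saveexec_b64, and the
; whole then block is skipped with s_cbranch_execz when no lanes are active.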
; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
  store i32 999, ptr addrspace(1) %gep
  br label %exit

exit:
  ret void
}

; FIXME: It would be better to endpgm in the then block.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
  store i32 999, ptr addrspace(1) %gep
  ret void

exit:
  ret void
}

; The final block has more than a ret to execute. This was miscompiled
; before function exit blocks were unified, since the endpgm would
; terminate the then wavefront before reaching the store.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: s_cbranch_execnz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: {{^.LBB[0-9]+_[0-9]+}}: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword

; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_endpgm

; SI-NEXT: {{^}}[[EXIT]]:
; SI: ds_write_b32
define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
  store i32 999, ptr addrspace(1) %gep
  ret void

exit:
  store volatile i32 7, ptr addrspace(3) undef
  ret void
}

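; Divergent guard around a loop that runs exactly 64 iterations per active
; lane (the counter starts at %tid and the bound is %tid + 64), so the back
; edge can use a scalar counter and compare (s_cmpk_lg_i32 / s_cbranch_scc1).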
; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:.LBB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:.LBB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: s_cmpk_lg_i32 s{{[0-9]+}}, 0x100
; SI: s_cbranch_scc1 [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  %gep.src = getelementptr i32, ptr addrspace(1) %src, i32 %i
  %gep.dst = getelementptr i32, ptr addrspace(1) %dst, i32 %i
  %load = load i32, ptr addrspace(1) %src
  store i32 %load, ptr addrspace(1) %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

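; Loop with multiple divergent exit conditions: lanes that leave the loop
; are accumulated into [[COND_STATE]] and masked out of exec with
; s_andn2_b64; the wave repeats the loop while any lane remains active.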
; SI-LABEL: {{^}}multi_vcond_loop:

; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; %bb.0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:.LBB[0-9]+_[0-9]+]]

; Initialize inner condition to false
; SI: ; %bb.{{[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}

; Clear exec bits for workitems that load -1s
; SI: .L[[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_cbranch_execz [[LABEL_FLOW:.LBB[0-9]+_[0-9]+]]

; SI: ; %bb.{{[0-9]+}}: ; %bb20
; SI: buffer_store_dword

; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz .L[[LABEL_LOOP]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture readonly %arg2, ptr addrspace(1) noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg3, i64 %tmp4
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp12
  %tmp14 = load i32, ptr addrspace(1) %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp12
  %tmp16 = load i32, ptr addrspace(1) %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp12
  store i32 %tmp21, ptr addrspace(1) %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }