xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll (revision 29441e4f5fa5f5c7709f7cf180815ba97f611297)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s
3
; Two <3 x i8> loads feed a phi in bb.2 whose result is stored to %dst.
; The checks expect each load to be widened with a shufflevector to <4 x i8>
; (the extra lane comes from the poison operand) and bitcast to i32, the phi to
; be rewritten on i32, and the value to be truncated to i24 and bitcast back to
; <3 x i8> right before the store.
4define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
5; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
6; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
7; GFX906-NEXT:  entry:
8; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
9; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
10; GFX906-NEXT:    [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
11; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
12; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
13; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
14; GFX906-NEXT:    [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
15; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
16; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
17; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
18; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
19; GFX906:       bb.1:
20; GFX906-NEXT:    br label [[BB_2]]
21; GFX906:       bb.2:
22; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
23; GFX906-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
24; GFX906-NEXT:    [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
25; GFX906-NEXT:    store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
26; GFX906-NEXT:    ret void
27;
28entry:
29  %idx = call i32 @llvm.amdgcn.workitem.id.x()
30  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
31  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
32  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
33  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
34  %cmp = icmp ult i32 %idx, 15
35  br i1 %cmp, label %bb.1, label %bb.2
36bb.1:
37  br label %bb.2
38
39bb.2:
40  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
41  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
42  ret void
43}
44
; Same shape as the <3 x i8> case, but with <4 x i8> values that already fill
; 32 bits. The checks expect a plain bitcast of each load to i32 (no widening
; shuffle), an i32 phi in bb.2, and a bitcast back to <4 x i8> before the store.
45define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
46; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
47; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
48; GFX906-NEXT:  entry:
49; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
50; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
51; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
52; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
53; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
54; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
55; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
56; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
57; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
58; GFX906:       bb.1:
59; GFX906-NEXT:    br label [[BB_2]]
60; GFX906:       bb.2:
61; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
62; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
63; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
64; GFX906-NEXT:    ret void
65;
66entry:
67  %idx = call i32 @llvm.amdgcn.workitem.id.x()
68  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
69  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
70  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
71  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
72  %cmp = icmp ult i32 %idx, 15
73  br i1 %cmp, label %bb.1, label %bb.2
74bb.1:
75  br label %bb.2
76
77bb.2:
78  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
79  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
80  ret void
81}
82
; <5 x i8> values that do not fill a whole number of 32-bit words.
; The checks expect each load to be padded to <8 x i8> with a shufflevector
; (lanes 5..7 come from the poison operand) and bitcast to <2 x i32>, the phi to
; be rewritten on <2 x i32>, and the result to be bitcast to <8 x i8> and
; shuffled back down to the first five lanes before the store.
83define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
84; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
85; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
86; GFX906-NEXT:  entry:
87; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
88; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
89; GFX906-NEXT:    [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
90; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
91; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
92; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
93; GFX906-NEXT:    [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
94; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
95; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
96; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
97; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
98; GFX906:       bb.1:
99; GFX906-NEXT:    br label [[BB_2]]
100; GFX906:       bb.2:
101; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
102; GFX906-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
103; GFX906-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
104; GFX906-NEXT:    store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
105; GFX906-NEXT:    ret void
106;
107entry:
108  %idx = call i32 @llvm.amdgcn.workitem.id.x()
109  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
110  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
111  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
112  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
113  %cmp = icmp ult i32 %idx, 15
114  br i1 %cmp, label %bb.1, label %bb.2
115bb.1:
116  br label %bb.2
117
118bb.2:
119  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
120  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
121  ret void
122}
123
; <8 x i8> values that exactly fill two 32-bit words.
; The checks expect each load to be bitcast to <2 x i32> (no padding shuffle),
; the phi in bb.2 to be rewritten on <2 x i32>, and a bitcast back to <8 x i8>
; before the store.
124define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
125; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
126; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
127; GFX906-NEXT:  entry:
128; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
129; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
130; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
131; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
132; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
133; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
134; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
135; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
136; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
137; GFX906:       bb.1:
138; GFX906-NEXT:    br label [[BB_2]]
139; GFX906:       bb.2:
140; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
141; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
142; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
143; GFX906-NEXT:    ret void
144;
145entry:
146  %idx = call i32 @llvm.amdgcn.workitem.id.x()
147  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
148  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
149  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
150  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
151  %cmp = icmp ult i32 %idx, 15
152  br i1 %cmp, label %bb.1, label %bb.2
153bb.1:
154  br label %bb.2
155
156bb.2:
157  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
158  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
159  ret void
160}
161
; The switch in entry targets %return.sink.split from two different cases, so
; the phi there lists %entry twice as a predecessor.
; The checks expect the phi to be rewritten on i32 while keeping both duplicate
; [ %vec1, %entry ] incoming entries, followed by a bitcast back to <4 x i8>
; for the store.
162define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
163; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
164; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
165; GFX906-NEXT:  entry:
166; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
167; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
168; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
169; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
170; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
171; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
172; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
173; GFX906-NEXT:    switch i32 [[IN]], label [[RETURN:%.*]] [
174; GFX906-NEXT:      i32 1, label [[RETURN_SINK_SPLIT:%.*]]
175; GFX906-NEXT:      i32 2, label [[RETURN_SINK_SPLIT]]
176; GFX906-NEXT:      i32 3, label [[SW_BB5:%.*]]
177; GFX906-NEXT:    ]
178; GFX906:       sw.bb5:
179; GFX906-NEXT:    br label [[RETURN_SINK_SPLIT]]
180; GFX906:       return.sink.split:
181; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
182; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
183; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
184; GFX906-NEXT:    ret void
185; GFX906:       return:
186; GFX906-NEXT:    ret void
187;
188entry:
189  %idx = call i32 @llvm.amdgcn.workitem.id.x()
190  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
191  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
192  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
193  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
194  switch i32 %in, label %return [
195  i32 1, label %return.sink.split
196  i32 2, label %return.sink.split
197  i32 3, label %sw.bb5
198  ]
199
200sw.bb5:
201  br label %return.sink.split
202
203return.sink.split:
204  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
205  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
206  ret void
207
208return:
209  ret void
210}
211
; A phi feeding another phi: %tmp5 (bb.2) is an incoming value of %tmp7 (bb.3).
; The checks expect both phis to be rewritten on <2 x i32>, with the second phi
; consuming the first converted phi directly (no round trip through <8 x i8>),
; and one bitcast back to <8 x i8> per store.
212define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
213; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
214; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST0:%.*]], ptr addrspace(1) captures(none) [[DST1:%.*]]) #[[ATTR0]] {
215; GFX906-NEXT:  entry:
216; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
217; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
218; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
219; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
220; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
221; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
222; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
223; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
224; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
225; GFX906:       bb.1:
226; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
227; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
228; GFX906:       bb.2:
229; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
230; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
231; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
232; GFX906-NEXT:    br label [[BB_3]]
233; GFX906:       bb.3:
234; GFX906-NEXT:    [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
235; GFX906-NEXT:    [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
236; GFX906-NEXT:    store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
237; GFX906-NEXT:    ret void
238;
239entry:
240  %idx = call i32 @llvm.amdgcn.workitem.id.x()
241  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
242  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
243  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
244  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
245  %cmp = icmp ult i32 %idx, 15
246  br i1 %cmp, label %bb.1, label %bb.2
247bb.1:
248  %cmp2 = icmp ult i32 %idx, 7
249  br i1 %cmp2, label %bb.2, label %bb.3
250
251bb.2:
252  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
253  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
254  br label %bb.3
255
256bb.3:
257  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
258  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
259  ret void
260}
261
; %vec1 has two users outside its defining block: a plain store in bb.2 and a
; three-way phi in bb.3.
; The checks expect the store in bb.2 to use a bitcast of the converted
; <2 x i32> value back to <8 x i8>, and the phi in bb.3 to be rewritten on
; <2 x i32> and bitcast back before its store.
262define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
263; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
264; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST0:%.*]], ptr addrspace(1) captures(none) [[DST1:%.*]]) #[[ATTR0]] {
265; GFX906-NEXT:  entry:
266; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
267; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
268; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
269; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
270; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
271; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
272; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
273; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
274; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
275; GFX906:       bb.1:
276; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
277; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
278; GFX906:       bb.2:
279; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
280; GFX906-NEXT:    store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
281; GFX906-NEXT:    br label [[BB_3]]
282; GFX906:       bb.3:
283; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
284; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
285; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
286; GFX906-NEXT:    ret void
287;
288entry:
289  %idx = call i32 @llvm.amdgcn.workitem.id.x()
290  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
291  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
292  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
293  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
294  %cmp = icmp ult i32 %idx, 15
295  br i1 %cmp, label %bb.1, label %bb.3
296bb.1:
297  %cmp2 = icmp ult i32 %idx, 7
298  br i1 %cmp2, label %bb.2, label %bb.3
299
300bb.2:
301  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
302  br label %bb.3
303
304bb.3:
305  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
306  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
307  ret void
308}
309
; Loop-carried vector value: %temp in bb.1 is a phi of %vec1 (from entry) and
; %vec2 (from the bb.1 back edge); %vec2 is also live out of the loop into bb.2.
; The checks expect %vec1 to be bitcast to i32 in entry, the phi to be rewritten
; on i32, both shuffle operands to be bitcast back to <4 x i8> inside the loop,
; the shuffle result to be bitcast to i32 for the back edge, and a bitcast back
; to <4 x i8> in bb.2 for the store.
; bb.1 ends in exactly one terminator (the conditional branch), so the function
; has no implicit unnamed, unreachable block.
; NOTE(review): despite the function name, only a <4 x i8> value is loaded; the
; <32 x i8> type is used solely as the GEP element type - confirm this is intended.
310define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
311; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
312; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
313; GFX906-NEXT:  entry:
314; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
315; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
316; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
317; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
318; GFX906-NEXT:    br label [[BB_1:%.*]]
319; GFX906:       bb.1:
320; GFX906-NEXT:    [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
321; GFX906-NEXT:    [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
322; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
323; GFX906-NEXT:    [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
324; GFX906-NEXT:    [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
325; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
326; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
329; GFX906:       bb.2:
330; GFX906-NEXT:    [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
331; GFX906-NEXT:    store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
332; GFX906-NEXT:    ret void
333;
334entry:
335  %idx = call i32 @llvm.amdgcn.workitem.id.x()
336  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
337  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
338  br label %bb.1
339
340bb.1:
341  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
342  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
343  %cmp = icmp ult i32 %idx, 15
344  br i1 %cmp, label %bb.1, label %bb.2
346
347bb.2:
348  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
349  ret void
350}
351
352; Should not produce a broken phi
353
; Cycle of <4 x i8> phis (%i -> %i4 -> %i8 -> %i) whose incoming values are
; constants or other phis, with constant-false branch conditions; the only
; non-phi user is the llvm.smax call in bb5, whose result is unused.
; The checks expect every phi to stay on <4 x i8> with no bitcasts inserted;
; the only textual difference is that the all-ones constant is printed as
; `splat (i8 1)`.
354define void @broken_phi() {
355; GFX906-LABEL: define void @broken_phi(
356; GFX906-SAME: ) #[[ATTR0]] {
357; GFX906-NEXT:  bb:
358; GFX906-NEXT:    br label [[BB1:%.*]]
359; GFX906:       bb1:
360; GFX906-NEXT:    [[I:%.*]] = phi <4 x i8> [ splat (i8 1), [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ]
361; GFX906-NEXT:    br i1 false, label [[BB3:%.*]], label [[BB2:%.*]]
362; GFX906:       bb2:
363; GFX906-NEXT:    br label [[BB3]]
364; GFX906:       bb3:
365; GFX906-NEXT:    [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ]
366; GFX906-NEXT:    br i1 false, label [[BB7]], label [[BB5:%.*]]
367; GFX906:       bb5:
368; GFX906-NEXT:    [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer)
369; GFX906-NEXT:    br label [[BB7]]
370; GFX906:       bb7:
371; GFX906-NEXT:    [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ]
372; GFX906-NEXT:    br label [[BB1]]
373;
374bb:
375  br label %bb1
376bb1:
377  %i = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, %bb ], [ %i8, %bb7 ]
378  br i1 false, label %bb3, label %bb2
379bb2:
380  br label %bb3
381bb3:
382  %i4 = phi <4 x i8> [ zeroinitializer, %bb2 ], [ %i, %bb1 ]
383  br i1 false, label %bb7, label %bb5
384bb5:
385  %i6 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %i4, <4 x i8> zeroinitializer)
386  br label %bb7
387bb7:
388  %i8 = phi <4 x i8> [ zeroinitializer, %bb5 ], [ zeroinitializer, %bb3 ]
389  br label %bb1
390}
391
392; %sel1 should just use %sel0 instead of trying to convert back the
393; converted version of %sel0
394
; %vec1 (defined in entry) and %sel0 (defined in bb.1) are both <16 x i8>
; values with a user in a later block.
; The checks expect: %vec1 bitcast to <4 x i32> in entry and bitcast back at the
; top of bb.1 for %sel1's first operand; %sel0 bitcast to <4 x i32> right after
; its definition; %sel1's second operand to be the original %sel0 value itself
; (same block, no conversion); and the extractelement in bb.2 to read a bitcast
; of the converted %sel0 back to <16 x i8>.
395define amdgpu_kernel void @reuseOp() {
396; GFX906-LABEL: define amdgpu_kernel void @reuseOp(
397; GFX906-SAME: ) #[[ATTR0]] {
398; GFX906-NEXT:  entry:
399; GFX906-NEXT:    [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
400; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32>
401; GFX906-NEXT:    br label [[BB_1:%.*]]
402; GFX906:       bb.1:
403; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
404; GFX906-NEXT:    [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
405; GFX906-NEXT:    [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
406; GFX906-NEXT:    [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
407; GFX906-NEXT:    br label [[BB_2:%.*]]
408; GFX906:       bb.2:
409; GFX906-NEXT:    [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>
410; GFX906-NEXT:    [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0
411; GFX906-NEXT:    ret void
412;
413entry:
414  %vec1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
415  br label %bb.1
416
417bb.1:
418  %sel0 = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
419  %sel1 = select i1 false, <16 x i8> %vec1, <16 x i8> %sel0
420  br label %bb.2
421
422bb.2:
423  %val = extractelement <16 x i8> %sel0, i64 0
424  ret void
425}
426
427
; Infinite loop (bb.1 .. bb.11) threading a <10 x i8> value through six phis
; (%phi1 .. %phi6), an insertelement in bb.4 and a shufflevector in bb.11 whose
; result feeds %phi1 on the back edge. %phi0 (i32) has no users.
; The checks expect the IR to come out unchanged: every phi stays <10 x i8>
; and no bitcasts or widening shuffles are inserted.
; NOTE(review): the function name suggests this is a regression test for a phi
; that was deleted during conversion - confirm against the originating change.
428define amdgpu_kernel void @deletedPHI(i32 %in0, i1 %cmp, <10 x i8> %invec0) {
429; GFX906-LABEL: define amdgpu_kernel void @deletedPHI(
430; GFX906-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] {
431; GFX906-NEXT:  entry:
432; GFX906-NEXT:    br label [[BB_1:%.*]]
433; GFX906:       bb.1:
434; GFX906-NEXT:    [[PHI0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB_11:%.*]] ]
435; GFX906-NEXT:    [[PHI1:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY]] ], [ [[VEC1:%.*]], [[BB_11]] ]
436; GFX906-NEXT:    br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
437; GFX906:       bb.2:
438; GFX906-NEXT:    br label [[BB_3]]
439; GFX906:       bb.3:
440; GFX906-NEXT:    [[PHI2:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI1]], [[BB_1]] ]
441; GFX906-NEXT:    br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
442; GFX906:       bb.4:
443; GFX906-NEXT:    [[VEC0:%.*]] = insertelement <10 x i8> [[PHI2]], i8 0, i64 0
444; GFX906-NEXT:    br label [[BB_5]]
445; GFX906:       bb.5:
446; GFX906-NEXT:    [[PHI3:%.*]] = phi <10 x i8> [ [[VEC0]], [[BB_4]] ], [ [[PHI2]], [[BB_3]] ]
447; GFX906-NEXT:    br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
448; GFX906:       bb.6:
449; GFX906-NEXT:    br label [[BB_7]]
450; GFX906:       bb.7:
451; GFX906-NEXT:    [[PHI4:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_6]] ], [ [[PHI3]], [[BB_5]] ]
452; GFX906-NEXT:    br i1 [[CMP]], label [[BB_9:%.*]], label [[BB_8:%.*]]
453; GFX906:       bb.8:
454; GFX906-NEXT:    br label [[BB_9]]
455; GFX906:       bb.9:
456; GFX906-NEXT:    [[PHI5:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_8]] ], [ [[PHI4]], [[BB_7]] ]
457; GFX906-NEXT:    br i1 [[CMP]], label [[BB_11]], label [[BB_10:%.*]]
458; GFX906:       bb.10:
459; GFX906-NEXT:    br label [[BB_11]]
460; GFX906:       bb.11:
461; GFX906-NEXT:    [[PHI6:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_10]] ], [ [[PHI5]], [[BB_9]] ]
462; GFX906-NEXT:    [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
463; GFX906-NEXT:    br label [[BB_1]]
464;
465entry:
466  br label %bb.1
467
468bb.1:
469  %phi0 = phi i32 [ 0, %entry ], [ 1, %bb.11 ]
470  %phi1 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %vec1, %bb.11 ]
471  br i1 %cmp, label %bb.3, label %bb.2
472
473bb.2:
474  br label %bb.3
475
476bb.3:
477  %phi2 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi1, %bb.1 ]
478  br i1 %cmp, label %bb.5, label %bb.4
479
480bb.4:
481  %vec0 = insertelement <10 x i8> %phi2, i8 0, i64 0
482  br label %bb.5
483
484bb.5:                               ; preds = %bb.4, %bb.3
485  %phi3 = phi <10 x i8> [ %vec0, %bb.4 ], [ %phi2, %bb.3 ]
486  br i1 %cmp, label %bb.7, label %bb.6
487
488bb.6:
489  br label %bb.7
490
491bb.7:                               ; preds = %bb.6, %bb.5
492  %phi4 = phi <10 x i8> [ %invec0, %bb.6 ], [ %phi3, %bb.5 ]
493  br i1 %cmp, label %bb.9, label %bb.8
494
495bb.8:
496  br label %bb.9
497
498bb.9:
499  %phi5 = phi <10 x i8> [ %invec0, %bb.8 ], [ %phi4, %bb.7 ]
500  br i1 %cmp, label %bb.11, label %bb.10
501
502bb.10:
503  br label %bb.11
504
505bb.11:
506  %phi6 = phi <10 x i8> [ zeroinitializer, %bb.10 ], [ %phi5, %bb.9 ]
507  %vec1 = shufflevector <10 x i8> %phi6, <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
508  br label %bb.1
509}
510
; Infinite loop of <10 x i8> phis with no non-phi users: %phi0 feeds both %phi1
; and %phi2, %phi1 feeds %phi2, %phi2 feeds %phi3, and %phi3 feeds %phi0 on the
; back edge through bb.8; %invec enters only via %phi3.
; The checks expect the IR to come out unchanged: every phi stays <10 x i8>
; and no conversion instructions are inserted.
; NOTE(review): the name presumably refers to the pass having to back out of
; several phis at once - confirm against the originating change.
511define amdgpu_kernel void @multiple_unwind(i1 %cmp, <10 x i8> %invec) {
512; GFX906-LABEL: define amdgpu_kernel void @multiple_unwind(
513; GFX906-SAME: i1 [[CMP:%.*]], <10 x i8> [[INVEC:%.*]]) #[[ATTR0]] {
514; GFX906-NEXT:  entry:
515; GFX906-NEXT:    br label [[BB_1:%.*]]
516; GFX906:       bb.1:
517; GFX906-NEXT:    [[PHI0:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY:%.*]] ], [ [[PHI3:%.*]], [[BB_8:%.*]] ]
518; GFX906-NEXT:    br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
519; GFX906:       bb.2:
520; GFX906-NEXT:    br label [[BB_3]]
521; GFX906:       bb.3:
522; GFX906-NEXT:    [[PHI1:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI0]], [[BB_1]] ]
523; GFX906-NEXT:    br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
524; GFX906:       bb.4:
525; GFX906-NEXT:    br label [[BB_5]]
526; GFX906:       bb.5:
527; GFX906-NEXT:    [[PHI2:%.*]] = phi <10 x i8> [ [[PHI0]], [[BB_4]] ], [ [[PHI1]], [[BB_3]] ]
528; GFX906-NEXT:    br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
529; GFX906:       bb.6:
530; GFX906-NEXT:    br label [[BB_7]]
531; GFX906:       bb.7:
532; GFX906-NEXT:    [[PHI3]] = phi <10 x i8> [ [[INVEC]], [[BB_6]] ], [ [[PHI2]], [[BB_5]] ]
533; GFX906-NEXT:    br label [[BB_8]]
534; GFX906:       bb.8:
535; GFX906-NEXT:    br label [[BB_1]]
536;
537entry:
538  br label %bb.1
539
540bb.1:
541  %phi0 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %phi3, %bb.8 ]
542  br i1 %cmp, label %bb.3, label %bb.2
543
544bb.2:
545  br label %bb.3
546
547bb.3:
548  %phi1 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi0, %bb.1 ]
549  br i1 %cmp, label %bb.5, label %bb.4
550
551bb.4:
552  br label %bb.5
553
554bb.5:
555  %phi2 = phi <10 x i8> [ %phi0, %bb.4 ], [ %phi1, %bb.3 ]
556  br i1 %cmp, label %bb.7, label %bb.6
557
558bb.6:                              ; preds = %bb.5
559  br label %bb.7
560
561bb.7:
562  %phi3 = phi <10 x i8> [ %invec, %bb.6 ], [ %phi2, %bb.5 ]
563  br label %bb.8
564
565bb.8:
566  br label %bb.1
567}
568
569
570
; Intrinsic called by the load-based kernels above; its i32 result is used as
; the GEP index and as the input to the branch comparisons.
571declare i32 @llvm.amdgcn.workitem.id.x()
572