; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

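; The MVETailPredication pass rewrites loops in which the masked memory
; operations are predicated on @llvm.get.active.lane.mask so that they use an
; @llvm.arm.mve.vctp* intrinsic instead. Roughly, for a 4 x i32 loop the
; expected result is (a sketch; the value names are illustrative):
;
;   %elems = phi i32 [ %N, %vector.ph ], [ %remaining, %vector.body ]
;   %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
;   %remaining = sub i32 %elems, 4
;
; The tests below check this pattern for 8-bit, 16-bit and 32-bit elements,
; and that loops the pass cannot handle keep using the active lane mask.
;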
; CHECK-LABEL: mul_v16i8
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> {{.*}}, ptr {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i8, ptr %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v8i16
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> {{.*}}, ptr {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i16, ptr %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, ptr %b, i32 %index
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp3, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, ptr %c, i32 %index
  tail call void @llvm.masked.store.v8i16.p0(<8 x i16> %mul, ptr %tmp6, i32 4, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v4i32
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %mul, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

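; The loads are split into low/high halves with shufflevector and recombined
; before the store; every masked memory operation still uses the active lane
; mask, so the conversion to vctp is expected to proceed as above.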
; CHECK-LABEL: split_vector
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %combine, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; One of the loads uses an 'ult' icmp (%wrong) as its predicate instead of
; the active lane mask.
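; The conversion is still expected to happen for the operations predicated on
; the active lane mask; the load that uses %wrong is left untouched.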
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The store uses an 'ult' icmp (%wrong) as its predicate instead of the
; active lane mask.
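; As above, the conversion is still expected to happen; the store keeps its
; %wrong predicate while the loads switch to the vctp mask.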
; CHECK-LABEL: mismatch_store_pred
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; TODO: Multiple get.active.lane.mask intrinsics are not yet supported.
; This loop is currently rejected because the unrolled vector body's
; induction step is not the single vector width the pass expects:
;
;   Step value 16 doesn't match vector width 4
;
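; Each unrolled iteration advances %index by 16 in total, while each of the
; four get.active.lane.mask calls covers only 4 lanes.
;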
; CHECK-LABEL: interleave4
; CHECK: vector.body:
; CHECK:  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
; CHECK:  %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
; CHECK:  %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
; CHECK:  %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
;
define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %v0 = add i32 %N, 15
  %v1 = lshr i32 %v0, 4
  %v2 = shl nuw i32 %v1, 4
  %v3 = add i32 %v2, -16
  %v4 = lshr i32 %v3, 4
  %v5 = add nuw nsw i32 %v4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %scevgep = getelementptr i32, ptr %A, i32 8
  %scevgep30 = getelementptr i32, ptr %C, i32 8
  %scevgep37 = getelementptr i32, ptr %B, i32 8
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

vector.body:
  %lsr.iv38 = phi ptr [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
  %lsr.iv31 = phi ptr [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %v7 = add i32 %index, 4
  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
  %v8 = add i32 %v7, 4
  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
  %v9 = add i32 %v8, 4
  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
  %scevgep42 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 -2
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep43 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 -1
  %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %lsr.iv38, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep41 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 1
  %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %scevgep34 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 -2
  %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep35 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 -1
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %lsr.iv31, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep36 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 1
  %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
  %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
  %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
  %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
  %scevgep27 = getelementptr <4 x i32>, ptr %lsr.iv, i32 -2
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v10, ptr %scevgep27, i32 4, <4 x i1> %active.lane.mask)
  %scevgep28 = getelementptr <4 x i32>, ptr %lsr.iv, i32 -1
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v11, ptr %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v12, ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask16)
  %scevgep29 = getelementptr <4 x i32>, ptr %lsr.iv, i32 1
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v13, ptr %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
  %scevgep25 = getelementptr i32, ptr %lsr.iv, i32 16
  %scevgep32 = getelementptr i32, ptr %lsr.iv31, i32 16
  %scevgep39 = getelementptr i32, ptr %lsr.iv38, i32 16
  %v14 = add i32 %v9, 4
  %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
  %v16 = icmp ne i32 %v15, 0
  br i1 %v16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

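; The element count passed to get.active.lane.mask is the constant 42 rather
; than something derived from the loop trip count, so the loop is expected to
; be left using the lane mask (no vctp).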
; CHECK-LABEL: const_expected_in_set_loop
; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT:   vctp
; CHECK:       ret void
;
define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

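; The element count operand of get.active.lane.mask is %index itself, which
; changes every iteration rather than being loop-invariant, so no vctp is
; expected.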
; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT:   vctp
; CHECK:       ret void
;
define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %vector.body, %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %entry
  ret void
}

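; The index phi starts at 1 rather than 0, so the lane mask does not describe
; the element counts the pass expects and no vctp is created.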
; CHECK-LABEL: addrec_base_not_zero
; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT:   vctp
; CHECK:       ret void
;
define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %vector.body, %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]

; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
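
; Attribute group #0 is referenced by several functions above but not defined
; here; this minimal definition is an assumption so the module parses.
attributes #0 = { nounwind }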