xref: /llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
3
4; The following functions should all fail to become tail-predicated.
5; CHECK-NOT: call i32 @llvm.arm.vctp
6
7; trip.count.minus.1 has been inserted into element 1, not 0.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by
; "lane index ule splat(bound)".
; Deviation: the preheader insertelement writes lane 1, while the following
; shuffle (zeroinitializer mask) broadcasts lane 0 of that vector.
8define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
9entry:
10  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
11  %tmp8 = add i32 %N, 3
12  %tmp9 = lshr i32 %tmp8, 2
13  %tmp10 = shl nuw i32 %tmp9, 2
14  %tmp11 = add i32 %tmp10, -4
15  %tmp12 = lshr i32 %tmp11, 2
16  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
17  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
18
19vector.ph:                                        ; preds = %entry
20  %trip.count.minus.1 = add i32 %N, -1
21  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1  ; deviation: lane 1
22  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
23  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
24  br label %vector.body
25
26vector.body:                                      ; preds = %vector.body, %vector.ph
27  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
28  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
29  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
30  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
31  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
32  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
33  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
34  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
35  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
36  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
37  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
38  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
39  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
40  %index.next = add i32 %index, 4
41  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
42  %tmp16 = icmp ne i32 %tmp15, 0
43  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
44
45for.cond.cleanup:                                 ; preds = %vector.body, %entry
46  ret void
47}
48
49; The insert isn't using an undef for operand 0.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by
; "lane index ule splat(%N - 1)".
; Deviation: the preheader insertelement starts from the constant vector
; <1, 1, 1, 1> instead of undef.
50define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
51entry:
52  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
53  %tmp8 = add i32 %N, 3
54  %tmp9 = lshr i32 %tmp8, 2
55  %tmp10 = shl nuw i32 %tmp9, 2
56  %tmp11 = add i32 %tmp10, -4
57  %tmp12 = lshr i32 %tmp11, 2
58  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
59  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
60
61vector.ph:                                        ; preds = %entry
62  %trip.count.minus.1 = add i32 %N, -1
63  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0  ; deviation: defined base vector
64  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
65  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
66  br label %vector.body
67
68vector.body:                                      ; preds = %vector.body, %vector.ph
69  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
70  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
71  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
72  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
73  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
74  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
75  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
76  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
77  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
78  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
79  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
80  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
81  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
82  %index.next = add i32 %index, 4
83  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
84  %tmp16 = icmp ne i32 %tmp15, 0
85  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
86
87for.cond.cleanup:                                 ; preds = %vector.body, %entry
88  ret void
89}
90
91; The shuffle uses a defined value for operand 1.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by
; "lane index ule splat(%N - 1)".
; Deviation: the second vector operand of the preheader shufflevector is the
; constant <1, 1, 1, 1> instead of undef (the zeroinitializer mask still
; selects lane 0 of the first operand only).
92define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
93entry:
94  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
95  %tmp8 = add i32 %N, 3
96  %tmp9 = lshr i32 %tmp8, 2
97  %tmp10 = shl nuw i32 %tmp9, 2
98  %tmp11 = add i32 %tmp10, -4
99  %tmp12 = lshr i32 %tmp11, 2
100  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
101  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
102
103vector.ph:                                        ; preds = %entry
104  %trip.count.minus.1 = add i32 %N, -1
105  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
106  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer  ; deviation: defined second operand
107  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
108  br label %vector.body
109
110vector.body:                                      ; preds = %vector.body, %vector.ph
111  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
112  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
113  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
114  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
115  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
116  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
117  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
118  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
119  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
120  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
121  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
122  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
123  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
124  %index.next = add i32 %index, 4
125  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
126  %tmp16 = icmp ne i32 %tmp15, 0
127  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
128
129for.cond.cleanup:                                 ; preds = %vector.body, %entry
130  ret void
131}
132
133; The shuffle uses a non zero value for operand 2.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by
; "lane index ule splat(bound)".
; Deviation: the preheader shuffle mask is <1, 1, 1, 1>, so it broadcasts
; lane 1 of the insertelement result, whereas %trip.count.minus.1 was
; written to lane 0.
134define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
135entry:
136  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
137  %tmp8 = add i32 %N, 3
138  %tmp9 = lshr i32 %tmp8, 2
139  %tmp10 = shl nuw i32 %tmp9, 2
140  %tmp11 = add i32 %tmp10, -4
141  %tmp12 = lshr i32 %tmp11, 2
142  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
143  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
144
145vector.ph:                                        ; preds = %entry
146  %trip.count.minus.1 = add i32 %N, -1
147  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
148  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>  ; deviation: mask selects lane 1
149  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
150  br label %vector.body
151
152vector.body:                                      ; preds = %vector.body, %vector.ph
153  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
154  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
155  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
156  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
157  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
158  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
159  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
160  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
161  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
162  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
163  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
164  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
165  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
166  %index.next = add i32 %index, 4
167  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
168  %tmp16 = icmp ne i32 %tmp15, 0
169  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
170
171for.cond.cleanup:                                 ; preds = %vector.body, %entry
172  ret void
173}
174
175; The splatted mask bound is %N - 2 rather than %N - 1.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by
; "lane index ule splat(bound)".
; Deviation: the bound is %N - 2. The scalar is inserted into lane 0 (the
; lane the zeroinitializer shuffle broadcasts) so that the wrong bound is the
; only difference from the canonical splat; inserting into lane 1 would
; leave the broadcast lane undef and never expose %N - 2 to the compare.
176define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
177entry:
178  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
179  %tmp8 = add i32 %N, 3
180  %tmp9 = lshr i32 %tmp8, 2
181  %tmp10 = shl nuw i32 %tmp9, 2
182  %tmp11 = add i32 %tmp10, -4
183  %tmp12 = lshr i32 %tmp11, 2
184  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
185  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
186
187vector.ph:                                        ; preds = %entry
188  %trip.count.minus.2 = add i32 %N, -2  ; deviation: -2 instead of -1
189  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0
190  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
191  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
192  br label %vector.body
193
194vector.body:                                      ; preds = %vector.body, %vector.ph
195  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
196  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
197  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
198  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
199  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
200  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
201  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
202  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
203  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
204  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
205  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
206  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
207  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
208  %index.next = add i32 %index, 4
209  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
210  %tmp16 = icmp ne i32 %tmp15, 0
211  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
212
213for.cond.cleanup:                                 ; preds = %vector.body, %entry
214  ret void
215}
216
217; index has been inserted at element 1, not 0.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by a vector compare
; against splat(%N - 1).
; Deviation: inside the loop body %index is inserted into lane 1, while the
; following shuffle (zeroinitializer mask) broadcasts lane 0.
218define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
219entry:
220  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
221  %tmp8 = add i32 %N, 3
222  %tmp9 = lshr i32 %tmp8, 2
223  %tmp10 = shl nuw i32 %tmp9, 2
224  %tmp11 = add i32 %tmp10, -4
225  %tmp12 = lshr i32 %tmp11, 2
226  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
227  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
228
229vector.ph:                                        ; preds = %entry
230  %trip.count.minus.1 = add i32 %N, -1
231  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
232  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
233  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
234  br label %vector.body
235
236vector.body:                                      ; preds = %vector.body, %vector.ph
237  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
238  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
239  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1  ; deviation: lane 1
240  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
241  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
242  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
243  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
244  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
245  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
246  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
247  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
248  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
249  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
250  %index.next = add i32 %index, 4
251  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
252  %tmp16 = icmp ne i32 %tmp15, 0
253  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
254
255for.cond.cleanup:                                 ; preds = %vector.body, %entry
256  ret void
257}
258
; The value splatted in the loop body is %index + 1, not %index.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by a vector compare
; against splat(%N - 1).
; Deviation: the lane-index vector is built from %incorrect (= %index + 1),
; while the getelementptr addresses still use %index.
259define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
260entry:
261  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
262  %tmp8 = add i32 %N, 3
263  %tmp9 = lshr i32 %tmp8, 2
264  %tmp10 = shl nuw i32 %tmp9, 2
265  %tmp11 = add i32 %tmp10, -4
266  %tmp12 = lshr i32 %tmp11, 2
267  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
268  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
269
270vector.ph:                                        ; preds = %entry
271  %trip.count.minus.1 = add i32 %N, -1
272  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
273  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
274  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
275  br label %vector.body
276
277vector.body:                                      ; preds = %vector.body, %vector.ph
278  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
279  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
280  %incorrect = add i32 %index, 1  ; deviation: offset copy of the induction variable
281  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
282  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
283  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
284  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
285  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
286  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
287  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
288  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
289  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
290  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
291  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
292  %index.next = add i32 %index, 4
293  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
294  %tmp16 = icmp ne i32 %tmp15, 0
295  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
296
297for.cond.cleanup:                                 ; preds = %vector.body, %entry
298  ret void
299}
300
301; Now using ult, not ule for the vector icmp
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by a vector compare
; of the lane indices against splat(%N - 1).
; Deviation: that compare uses the ult predicate, so the lane whose index
; equals %N - 1 is masked off.
302define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
303entry:
304  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
305  %tmp8 = add i32 %N, 3
306  %tmp9 = lshr i32 %tmp8, 2
307  %tmp10 = shl nuw i32 %tmp9, 2
308  %tmp11 = add i32 %tmp10, -4
309  %tmp12 = lshr i32 %tmp11, 2
310  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
311  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
312
313vector.ph:                                        ; preds = %entry
314  %trip.count.minus.1 = add i32 %N, -1
315  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
316  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
317  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
318  br label %vector.body
319
320vector.body:                                      ; preds = %vector.body, %vector.ph
321  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
322  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
323  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
324  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
325  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
326  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
327  %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11  ; deviation: ult; mask shared by both loads and the store
328  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
329  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
330  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
331  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
332  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
333  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
334  %index.next = add i32 %index, 4
335  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
336  %tmp16 = icmp ne i32 %tmp15, 0
337  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
338
339for.cond.cleanup:                                 ; preds = %vector.body, %entry
340  ret void
341}
342
343; The add in the body uses 1, 2, 3, 4
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by a vector compare
; against splat(%N - 1).
; Deviation: the constant added to splat(%index) is <1, 2, 3, 4> instead of
; <0, 1, 2, 3>, so the compared values are one past each lane's element index.
344define void @wrong_body_broadcast_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
345entry:
346  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
347  %tmp8 = add i32 %N, 3
348  %tmp9 = lshr i32 %tmp8, 2
349  %tmp10 = shl nuw i32 %tmp9, 2
350  %tmp11 = add i32 %tmp10, -4
351  %tmp12 = lshr i32 %tmp11, 2
352  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
353  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
354
355vector.ph:                                        ; preds = %entry
356  %trip.count.minus.1 = add i32 %N, -1
357  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
358  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
359  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
360  br label %vector.body
361
362vector.body:                                      ; preds = %vector.body, %vector.ph
363  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
364  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
365  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
366  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
367  %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>  ; deviation: offsets start at 1
368  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
369  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
370  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
371  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
372  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
373  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
374  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
375  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
376  %index.next = add i32 %index, 4
377  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
378  %tmp16 = icmp ne i32 %tmp15, 0
379  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
380
381for.cond.cleanup:                                 ; preds = %vector.body, %entry
382  ret void
383}
384
385; Using a variable for the loop body broadcast.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by a vector compare
; against splat(%N - 1).
; Deviation: the vector added to splat(%index) is the function argument
; %offsets rather than the constant <0, 1, 2, 3>, so the compared values are
; not known at compile time.
386define void @wrong_body_broadcast_splat_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
387entry:
388  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
389  %tmp8 = add i32 %N, 3
390  %tmp9 = lshr i32 %tmp8, 2
391  %tmp10 = shl nuw i32 %tmp9, 2
392  %tmp11 = add i32 %tmp10, -4
393  %tmp12 = lshr i32 %tmp11, 2
394  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
395  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
396
397vector.ph:                                        ; preds = %entry
398  %trip.count.minus.1 = add i32 %N, -1
399  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
400  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
401  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
402  br label %vector.body
403
404vector.body:                                      ; preds = %vector.body, %vector.ph
405  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
406  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
407  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
408  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
409  %induction = add <4 x i32> %broadcast.splat, %offsets  ; deviation: non-constant offsets
410  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
411  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
412  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
413  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
414  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
415  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
416  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
417  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
418  %index.next = add i32 %index, 4
419  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
420  %tmp16 = icmp ne i32 %tmp15, 0
421  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
422
423for.cond.cleanup:                                 ; preds = %vector.body, %entry
424  ret void
425}
426
427; adding 5, instead of 4, to index.
; Negative test: per the file-level CHECK-NOT, no llvm.arm.vctp call may be
; produced for this loop. The loop multiplies %a and %b into %c, four i32
; lanes per iteration, with every memory access masked by
; "lane index ule splat(%N - 1)".
; Deviation: %index advances by 5 per iteration although each iteration
; handles a <4 x i32> vector.
428define void @wrong_index_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
429entry:
430  %cmp8 = icmp eq i32 %N, 0  ; %N == 0: branch straight to the exit block
431  %tmp8 = add i32 %N, 3
432  %tmp9 = lshr i32 %tmp8, 2
433  %tmp10 = shl nuw i32 %tmp9, 2
434  %tmp11 = add i32 %tmp10, -4
435  %tmp12 = lshr i32 %tmp11, 2
436  %tmp13 = add nuw nsw i32 %tmp12, 1  ; (((%N + 3) & ~3) - 4) / 4 + 1, passed to start.loop.iterations
437  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
438
439vector.ph:                                        ; preds = %entry
440  %trip.count.minus.1 = add i32 %N, -1
441  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
442  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
443  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
444  br label %vector.body
445
446vector.body:                                      ; preds = %vector.body, %vector.ph
447  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
448  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
449  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
450  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
451  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>  ; per-lane element indices
452  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
453  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11  ; mask shared by both loads and the store
454  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
455  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
456  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
457  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
458  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
459  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
460  %index.next = add i32 %index, 5  ; deviation: step of 5
461  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)  ; count down by one per iteration
462  %tmp16 = icmp ne i32 %tmp15, 0
463  br i1 %tmp16, label %vector.body, label %for.cond.cleanup  ; repeat until the counter reaches 0
464
465for.cond.cleanup:                                 ; preds = %vector.body, %entry
466  ret void
467}
468
; Intrinsics called by the test functions in this file: the masked vector
; load/store used for the predicated memory accesses, and the two
; loop-counter intrinsics used in the preheaders and loop latches.
; NOTE(review): attribute groups #1, #2 and #3 are referenced here but no
; "attributes #N = { ... }" definition is visible in this file - confirm
; this is intentional.
469declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1
470declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
471declare i32 @llvm.start.loop.iterations.i32(i32) #3
472declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
473
474