xref: /llvm-project/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll (revision 29441e4f5fa5f5c7709f7cf180815ba97f611297)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
2; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5target triple = "aarch64"
6
; Narrowing: the scalar loop widens each i8 element through i32
; (zext -> add -> trunc). The vectorizer is expected to shrink the
; arithmetic back to i8, emitting the add on <16 x i8> in the main
; vector loop and on <4 x i8> in the vector epilogue loop.
7define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
8; CHECK-LABEL: define void @add_a
9; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
10; CHECK-NEXT:  entry:
11; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[LEN]], 0
12; CHECK-NEXT:    br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
13; CHECK:       iter.check:
14; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
15; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
16; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
17; CHECK:       vector.main.loop.iter.check:
18; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
19; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
20; CHECK:       vector.ph:
21; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
22; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
23; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
24; CHECK:       vector.body:
25; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
26; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
27; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
28; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
29; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
30; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i8> [[WIDE_LOAD]], splat (i8 2)
31; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]]
32; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
33; CHECK-NEXT:    store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1
34; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
35; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
36; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
37; CHECK:       middle.block:
38; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
39; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
40; CHECK:       vec.epilog.iter.check:
41; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
42; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
43; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
44; CHECK:       vec.epilog.ph:
45; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
46; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
47; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
48; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
49; CHECK:       vec.epilog.vector.body:
50; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
51; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX4]], 0
52; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]]
53; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
54; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
55; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[WIDE_LOAD5]], splat (i8 2)
56; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]]
57; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
58; CHECK-NEXT:    store <4 x i8> [[TMP11]], ptr [[TMP13]], align 1
59; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
60; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
61; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
62; CHECK:       vec.epilog.middle.block:
63; CHECK-NEXT:    [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
64; CHECK-NEXT:    br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
65; CHECK:       vec.epilog.scalar.ph:
66; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
67; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
68; CHECK:       for.cond.cleanup.loopexit:
69; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
70; CHECK:       for.cond.cleanup:
71; CHECK-NEXT:    ret void
72; CHECK:       for.body:
73; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
74; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
75; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
76; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP15]] to i32
77; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
78; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i8
79; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
80; CHECK-NEXT:    store i8 [[CONV1]], ptr [[ARRAYIDX3]], align 1
81; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
82; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
83; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
84; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
85;
; Scalar input: for (i = 0; i < len; ++i) q[i] = (i8)((i32)p[i] + 2);
86entry:
87  %cmp8 = icmp sgt i32 %len, 0
88  br i1 %cmp8, label %for.body, label %for.cond.cleanup

89
90for.cond.cleanup:                                 ; preds = %for.body, %entry
91  ret void
92
93for.body:                                         ; preds = %entry, %for.body
94  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
95  %arrayidx = getelementptr inbounds i8, ptr %p, i64 %indvars.iv
96  %0 = load i8, ptr %arrayidx
97  %conv = zext i8 %0 to i32
98  %add = add nuw nsw i32 %conv, 2
99  %conv1 = trunc i32 %add to i8
100  %arrayidx3 = getelementptr inbounds i8, ptr %q, i64 %indvars.iv
101  store i8 %conv1, ptr %arrayidx3
102  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
103  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
104  %exitcond = icmp eq i32 %lftr.wideiv, %len
105  br i1 %exitcond, label %for.cond.cleanup, label %for.body
106}
107
108; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
109; working with.
; Here the scalar add is already i8 (no i32 widening), so no type shrinking
; takes place and the nuw/nsw flags must survive on the vector add
; (<16 x i8> in the main loop, <4 x i8> in the epilogue).
110define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
111; CHECK-LABEL: define void @add_a1
112; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
113; CHECK-NEXT:  entry:
114; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[LEN]], 0
115; CHECK-NEXT:    br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
116; CHECK:       iter.check:
117; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
118; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
119; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
120; CHECK:       vector.main.loop.iter.check:
121; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
122; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
123; CHECK:       vector.ph:
124; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
125; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
126; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
127; CHECK:       vector.body:
128; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
129; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
130; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
131; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
132; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
133; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <16 x i8> [[WIDE_LOAD]], splat (i8 2)
134; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]]
135; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
136; CHECK-NEXT:    store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1
137; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
138; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
139; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
140; CHECK:       middle.block:
141; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
142; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
143; CHECK:       vec.epilog.iter.check:
144; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
145; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
146; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
147; CHECK:       vec.epilog.ph:
148; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
149; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
150; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
151; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
152; CHECK:       vec.epilog.vector.body:
153; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
154; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX4]], 0
155; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]]
156; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
157; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
158; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <4 x i8> [[WIDE_LOAD5]], splat (i8 2)
159; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]]
160; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
161; CHECK-NEXT:    store <4 x i8> [[TMP11]], ptr [[TMP13]], align 1
162; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
163; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
164; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
165; CHECK:       vec.epilog.middle.block:
166; CHECK-NEXT:    [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
167; CHECK-NEXT:    br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
168; CHECK:       vec.epilog.scalar.ph:
169; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
170; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
171; CHECK:       for.cond.cleanup.loopexit:
172; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
173; CHECK:       for.cond.cleanup:
174; CHECK-NEXT:    ret void
175; CHECK:       for.body:
176; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
177; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
178; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
179; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i8 [[TMP15]], 2
180; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
181; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX3]], align 1
182; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
183; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
184; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
185; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
186;
; Scalar input: a straight i8 add carrying nuw/nsw, q[i] = p[i] + 2.
187entry:
188  %cmp8 = icmp sgt i32 %len, 0
189  br i1 %cmp8, label %for.body, label %for.cond.cleanup
190
191for.cond.cleanup:                                 ; preds = %for.body, %entry
192  ret void
193
194for.body:                                         ; preds = %entry, %for.body
195  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
196  %arrayidx = getelementptr inbounds i8, ptr %p, i64 %indvars.iv
197  %0 = load i8, ptr %arrayidx
198  %add = add nuw nsw i8 %0, 2
199  %arrayidx3 = getelementptr inbounds i8, ptr %q, i64 %indvars.iv
200  store i8 %add, ptr %arrayidx3
201  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
202  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
203  %exitcond = icmp eq i32 %lftr.wideiv, %len
204  br i1 %exitcond, label %for.cond.cleanup, label %for.body
205}
206
; i16 variant of @add_a: the i32 zext/add/trunc chain is shrunk to an i16
; add on <8 x i16>. Note no vector epilogue loop is generated here — the
; checks go straight from middle.block to a plain scalar.ph.
207define void @add_b(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
208; CHECK-LABEL: define void @add_b
209; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
210; CHECK-NEXT:  entry:
211; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[LEN]], 0
212; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
213; CHECK:       for.body.preheader:
214; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
215; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
216; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
217; CHECK:       vector.ph:
218; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
219; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
220; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
221; CHECK:       vector.body:
222; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
223; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
224; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP1]]
225; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
226; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
227; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[WIDE_LOAD]], splat (i16 2)
228; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]]
229; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
230; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
231; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
232; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
233; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
234; CHECK:       middle.block:
235; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
236; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
237; CHECK:       scalar.ph:
238; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
239; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
240; CHECK:       for.cond.cleanup.loopexit:
241; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
242; CHECK:       for.cond.cleanup:
243; CHECK-NEXT:    ret void
244; CHECK:       for.body:
245; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
246; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
247; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
248; CHECK-NEXT:    [[CONV8:%.*]] = zext i16 [[TMP8]] to i32
249; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV8]], 2
250; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
251; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]]
252; CHECK-NEXT:    store i16 [[CONV1]], ptr [[ARRAYIDX3]], align 2
253; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
254; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
255; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
256; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
257;
; Scalar input: for (i = 0; i < len; ++i) q[i] = (i16)((i32)p[i] + 2);
258entry:
259  %cmp9 = icmp sgt i32 %len, 0
260  br i1 %cmp9, label %for.body, label %for.cond.cleanup
261
262for.cond.cleanup:                                 ; preds = %for.body, %entry
263  ret void
264
265for.body:                                         ; preds = %entry, %for.body
266  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
267  %arrayidx = getelementptr inbounds i16, ptr %p, i64 %indvars.iv
268  %0 = load i16, ptr %arrayidx
269  %conv8 = zext i16 %0 to i32
270  %add = add nuw nsw i32 %conv8, 2
271  %conv1 = trunc i32 %add to i16
272  %arrayidx3 = getelementptr inbounds i16, ptr %q, i64 %indvars.iv
273  store i16 %conv1, ptr %arrayidx3
274  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
275  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
276  %exitcond = icmp eq i32 %lftr.wideiv, %len
277  br i1 %exitcond, label %for.cond.cleanup, label %for.body
278}
279
; Mixed element widths: i8 loads but i16 stores. The computation can only
; be shrunk from i32 down to i16 — the <16 x i8> load is zero-extended to
; <16 x i16>, added, and stored as <16 x i16> (with a <4 x i16> epilogue).
280define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
281; CHECK-LABEL: define void @add_c
282; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
283; CHECK-NEXT:  entry:
284; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[LEN]], 0
285; CHECK-NEXT:    br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
286; CHECK:       iter.check:
287; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
288; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
289; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
290; CHECK:       vector.main.loop.iter.check:
291; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
292; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
293; CHECK:       vector.ph:
294; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
295; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
296; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
297; CHECK:       vector.body:
298; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
299; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
300; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
301; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
302; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
303; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
304; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP4]], splat (i16 2)
305; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP1]]
306; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
307; CHECK-NEXT:    store <16 x i16> [[TMP5]], ptr [[TMP7]], align 2
308; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
309; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
310; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
311; CHECK:       middle.block:
312; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
313; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
314; CHECK:       vec.epilog.iter.check:
315; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
316; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
317; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
318; CHECK:       vec.epilog.ph:
319; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
320; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
321; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
322; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
323; CHECK:       vec.epilog.vector.body:
324; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
325; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX4]], 0
326; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]]
327; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
328; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
329; CHECK-NEXT:    [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16>
330; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i16> [[TMP12]], splat (i16 2)
331; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP9]]
332; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0
333; CHECK-NEXT:    store <4 x i16> [[TMP13]], ptr [[TMP15]], align 2
334; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
335; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
336; CHECK-NEXT:    br i1 [[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
337; CHECK:       vec.epilog.middle.block:
338; CHECK-NEXT:    [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
339; CHECK-NEXT:    br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
340; CHECK:       vec.epilog.scalar.ph:
341; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
342; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
343; CHECK:       for.cond.cleanup.loopexit:
344; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
345; CHECK:       for.cond.cleanup:
346; CHECK-NEXT:    ret void
347; CHECK:       for.body:
348; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
349; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
350; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
351; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP17]] to i32
352; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
353; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
354; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]]
355; CHECK-NEXT:    store i16 [[CONV1]], ptr [[ARRAYIDX3]], align 2
356; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
357; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
358; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
359; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
360;
; Scalar input: p is i8*, q is i16*; q[i] = (i16)((i32)p[i] + 2).
361entry:
362  %cmp8 = icmp sgt i32 %len, 0
363  br i1 %cmp8, label %for.body, label %for.cond.cleanup
364
365for.cond.cleanup:                                 ; preds = %for.body, %entry
366  ret void
367
368for.body:                                         ; preds = %entry, %for.body
369  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
370  %arrayidx = getelementptr inbounds i8, ptr %p, i64 %indvars.iv
371  %0 = load i8, ptr %arrayidx
372  %conv = zext i8 %0 to i32
373  %add = add nuw nsw i32 %conv, 2
374  %conv1 = trunc i32 %add to i16
375  %arrayidx3 = getelementptr inbounds i16, ptr %q, i64 %indvars.iv
376  store i16 %conv1, ptr %arrayidx3
377  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
378  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
379  %exitcond = icmp eq i32 %lftr.wideiv, %len
380  br i1 %exitcond, label %for.cond.cleanup, label %for.body
381}
382
; No shrinking possible: the i32 result of sext(i16 load) + 2 is stored as
; i32, so the vector loop must keep the full-width computation — a sext to
; <8 x i32> followed by an <8 x i32> add that retains the nsw flag.
383define void @add_d(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
384; CHECK-LABEL: define void @add_d
385; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
386; CHECK-NEXT:  entry:
387; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[LEN]], 0
388; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
389; CHECK:       for.body.preheader:
390; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
391; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
392; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
393; CHECK:       vector.ph:
394; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
395; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
396; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
397; CHECK:       vector.body:
398; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
399; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
400; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP1]]
401; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
402; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
403; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
404; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[TMP4]], splat (i32 2)
405; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[TMP1]]
406; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
407; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr [[TMP7]], align 4
408; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
409; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
410; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
411; CHECK:       middle.block:
412; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
413; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
414; CHECK:       scalar.ph:
415; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
416; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
417; CHECK:       for.cond.cleanup.loopexit:
418; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
419; CHECK:       for.cond.cleanup:
420; CHECK-NEXT:    ret void
421; CHECK:       for.body:
422; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
423; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
424; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
425; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
426; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 2
427; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[INDVARS_IV]]
428; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
429; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
430; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
431; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
432; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
433;
; Scalar input: p is i16*, q is i32*; q[i] = (i32)p[i] + 2 (sign-extended).
434entry:
435  %cmp7 = icmp sgt i32 %len, 0
436  br i1 %cmp7, label %for.body, label %for.cond.cleanup
437
438for.cond.cleanup:                                 ; preds = %for.body, %entry
439  ret void
440
441for.body:                                         ; preds = %entry, %for.body
442  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
443  %arrayidx = getelementptr inbounds i16, ptr %p, i64 %indvars.iv
444  %0 = load i16, ptr %arrayidx
445  %conv = sext i16 %0 to i32
446  %add = add nsw i32 %conv, 2
447  %arrayidx2 = getelementptr inbounds i32, ptr %q, i64 %indvars.iv
448  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
449  store i32 %add, ptr %arrayidx2
450  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
451  %exitcond = icmp eq i32 %lftr.wideiv, %len
452  br i1 %exitcond, label %for.cond.cleanup, label %for.body
453}
454
455define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
456; CHECK-LABEL: define void @add_e
457; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 [[ARG1:%.*]], i8 [[ARG2:%.*]], i32 [[LEN:%.*]]) {
458; CHECK-NEXT:  entry:
459; CHECK-NEXT:    [[CMP_32:%.*]] = icmp sgt i32 [[LEN]], 0
460; CHECK-NEXT:    br i1 [[CMP_32]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
461; CHECK:       iter.check:
462; CHECK-NEXT:    [[CONV11:%.*]] = zext i8 [[ARG2]] to i32
463; CHECK-NEXT:    [[CONV13:%.*]] = zext i8 [[ARG1]] to i32
464; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
465; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
466; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
467; CHECK:       vector.main.loop.iter.check:
468; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
469; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
470; CHECK:       vector.ph:
471; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
472; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
473; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0
474; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
475; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8>
476; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0
477; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
478; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
479; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
480; CHECK:       vector.body:
481; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
482; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
483; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP3]]
484; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
485; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
486; CHECK-NEXT:    [[TMP6:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 4)
487; CHECK-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[TMP6]], splat (i8 32)
488; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i8> [[WIDE_LOAD]], splat (i8 51)
489; CHECK-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP8]], splat (i8 60)
490; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i8> [[TMP7]], [[TMP1]]
491; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i8> [[TMP9]], splat (i8 -4)
492; CHECK-NEXT:    [[TMP12:%.*]] = xor <16 x i8> [[TMP11]], [[TMP2]]
493; CHECK-NEXT:    [[TMP13:%.*]] = mul <16 x i8> [[TMP12]], [[TMP10]]
494; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
495; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
496; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[TMP15]], align 1
497; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
498; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
499; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
500; CHECK:       middle.block:
501; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
502; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
503; CHECK:       vec.epilog.iter.check:
504; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
505; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
506; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
507; CHECK:       vec.epilog.ph:
508; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
509; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4
510; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
511; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV13]], i64 0
512; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
513; CHECK-NEXT:    [[TMP17:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT7]] to <4 x i8>
514; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV11]], i64 0
515; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
516; CHECK-NEXT:    [[TMP18:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT9]] to <4 x i8>
517; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
518; CHECK:       vec.epilog.vector.body:
519; CHECK-NEXT:    [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
520; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX10]], 0
521; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP19]]
522; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0
523; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
524; CHECK-NEXT:    [[TMP22:%.*]] = shl <4 x i8> [[WIDE_LOAD11]], splat (i8 4)
525; CHECK-NEXT:    [[TMP23:%.*]] = add <4 x i8> [[TMP22]], splat (i8 32)
526; CHECK-NEXT:    [[TMP24:%.*]] = or <4 x i8> [[WIDE_LOAD11]], splat (i8 51)
527; CHECK-NEXT:    [[TMP25:%.*]] = mul <4 x i8> [[TMP24]], splat (i8 60)
528; CHECK-NEXT:    [[TMP26:%.*]] = and <4 x i8> [[TMP23]], [[TMP17]]
529; CHECK-NEXT:    [[TMP27:%.*]] = and <4 x i8> [[TMP25]], splat (i8 -4)
530; CHECK-NEXT:    [[TMP28:%.*]] = xor <4 x i8> [[TMP27]], [[TMP18]]
531; CHECK-NEXT:    [[TMP29:%.*]] = mul <4 x i8> [[TMP28]], [[TMP26]]
532; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP19]]
533; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
534; CHECK-NEXT:    store <4 x i8> [[TMP29]], ptr [[TMP31]], align 1
535; CHECK-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4
536; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
537; CHECK-NEXT:    br i1 [[TMP32]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
538; CHECK:       vec.epilog.middle.block:
539; CHECK-NEXT:    [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
540; CHECK-NEXT:    br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
541; CHECK:       vec.epilog.scalar.ph:
542; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
543; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
544; CHECK:       for.cond.cleanup.loopexit:
545; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
546; CHECK:       for.cond.cleanup:
547; CHECK-NEXT:    ret void
548; CHECK:       for.body:
549; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
550; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
551; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
552; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP33]] to i32
553; CHECK-NEXT:    [[ADD:%.*]] = shl i32 [[CONV]], 4
554; CHECK-NEXT:    [[CONV2:%.*]] = add nuw nsw i32 [[ADD]], 32
555; CHECK-NEXT:    [[OR:%.*]] = or i32 [[CONV]], 51
556; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[OR]], 60
557; CHECK-NEXT:    [[AND:%.*]] = and i32 [[CONV2]], [[CONV13]]
558; CHECK-NEXT:    [[MUL_MASKED:%.*]] = and i32 [[MUL]], 252
559; CHECK-NEXT:    [[CONV17:%.*]] = xor i32 [[MUL_MASKED]], [[CONV11]]
560; CHECK-NEXT:    [[MUL18:%.*]] = mul nuw nsw i32 [[CONV17]], [[AND]]
561; CHECK-NEXT:    [[CONV19:%.*]] = trunc i32 [[MUL18]] to i8
562; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
563; CHECK-NEXT:    store i8 [[CONV19]], ptr [[ARRAYIDX21]], align 1
564; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
565; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
566; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
567; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
568;
569entry:
570  %cmp.32 = icmp sgt i32 %len, 0
571  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
572
573for.body.lr.ph:                                   ; preds = %entry
574  %conv11 = zext i8 %arg2 to i32
575  %conv13 = zext i8 %arg1 to i32
576  br label %for.body
577
578for.cond.cleanup:                                 ; preds = %for.body, %entry
579  ret void
580
581for.body:                                         ; preds = %for.body, %for.body.lr.ph
582  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
583  %arrayidx = getelementptr inbounds i8, ptr %p, i64 %indvars.iv
584  %0 = load i8, ptr %arrayidx
585  %conv = zext i8 %0 to i32
586  %add = shl i32 %conv, 4
587  %conv2 = add nuw nsw i32 %add, 32
588  %or = or i32 %conv, 51
589  %mul = mul nuw nsw i32 %or, 60
590  %and = and i32 %conv2, %conv13
591  %mul.masked = and i32 %mul, 252
592  %conv17 = xor i32 %mul.masked, %conv11
593  %mul18 = mul nuw nsw i32 %conv17, %and
594  %conv19 = trunc i32 %mul18 to i8
595  %arrayidx21 = getelementptr inbounds i8, ptr %q, i64 %indvars.iv
596  store i8 %conv19, ptr %arrayidx21
597  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
598  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
599  %exitcond = icmp eq i32 %lftr.wideiv, %len
600  br i1 %exitcond, label %for.cond.cleanup, label %for.body
601}
602
define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
; CHECK-LABEL: define void @add_f
; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 [[ARG1:%.*]], i8 [[ARG2:%.*]], i32 [[LEN:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP_32:%.*]] = icmp sgt i32 [[LEN]], 0
; CHECK-NEXT:    br i1 [[CMP_32]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       iter.check:
; CHECK-NEXT:    [[CONV11:%.*]] = zext i8 [[ARG2]] to i32
; CHECK-NEXT:    [[CONV13:%.*]] = zext i8 [[ARG1]] to i32
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK:       vector.main.loop.iter.check:
; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV13]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i8>
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[CONV11]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT3]] to <16 x i8>
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
; CHECK-NEXT:    [[TMP7:%.*]] = shl <16 x i8> [[TMP6]], splat (i8 4)
; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i8> [[TMP7]], splat (i8 32)
; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i8> [[TMP6]], splat (i8 -52)
; CHECK-NEXT:    [[TMP10:%.*]] = or <16 x i8> [[TMP9]], splat (i8 51)
; CHECK-NEXT:    [[TMP11:%.*]] = mul <16 x i8> [[TMP10]], splat (i8 60)
; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i8> [[TMP8]], [[TMP1]]
; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i8> [[TMP11]], splat (i8 -4)
; CHECK-NEXT:    [[TMP14:%.*]] = xor <16 x i8> [[TMP13]], [[TMP2]]
; CHECK-NEXT:    [[TMP15:%.*]] = mul <16 x i8> [[TMP14]], [[TMP12]]
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
; CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[TMP17]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK:       vec.epilog.iter.check:
; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK:       vec.epilog.ph:
; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4
; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV13]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP19:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT7]] to <4 x i8>
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV11]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP20:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT9]] to <4 x i8>
; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK:       vec.epilog.vector.body:
; CHECK-NEXT:    [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX10]], 0
; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP21]]
; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i16>, ptr [[TMP23]], align 2
; CHECK-NEXT:    [[TMP24:%.*]] = trunc <4 x i16> [[WIDE_LOAD11]] to <4 x i8>
; CHECK-NEXT:    [[TMP25:%.*]] = shl <4 x i8> [[TMP24]], splat (i8 4)
; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i8> [[TMP25]], splat (i8 32)
; CHECK-NEXT:    [[TMP27:%.*]] = and <4 x i8> [[TMP24]], splat (i8 -52)
; CHECK-NEXT:    [[TMP28:%.*]] = or <4 x i8> [[TMP27]], splat (i8 51)
; CHECK-NEXT:    [[TMP29:%.*]] = mul <4 x i8> [[TMP28]], splat (i8 60)
; CHECK-NEXT:    [[TMP30:%.*]] = and <4 x i8> [[TMP26]], [[TMP19]]
; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i8> [[TMP29]], splat (i8 -4)
; CHECK-NEXT:    [[TMP32:%.*]] = xor <4 x i8> [[TMP31]], [[TMP20]]
; CHECK-NEXT:    [[TMP33:%.*]] = mul <4 x i8> [[TMP32]], [[TMP30]]
; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP21]]
; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0
; CHECK-NEXT:    store <4 x i8> [[TMP33]], ptr [[TMP35]], align 1
; CHECK-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4
; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
; CHECK-NEXT:    br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK:       vec.epilog.middle.block:
; CHECK-NEXT:    [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
; CHECK-NEXT:    br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK:       vec.epilog.scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup.loopexit:
; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP37]] to i32
; CHECK-NEXT:    [[ADD:%.*]] = shl i32 [[CONV]], 4
; CHECK-NEXT:    [[CONV2:%.*]] = add nsw i32 [[ADD]], 32
; CHECK-NEXT:    [[OR:%.*]] = and i32 [[CONV]], 204
; CHECK-NEXT:    [[CONV8:%.*]] = or i32 [[OR]], 51
; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV8]], 60
; CHECK-NEXT:    [[AND:%.*]] = and i32 [[CONV2]], [[CONV13]]
; CHECK-NEXT:    [[MUL_MASKED:%.*]] = and i32 [[MUL]], 252
; CHECK-NEXT:    [[CONV17:%.*]] = xor i32 [[MUL_MASKED]], [[CONV11]]
; CHECK-NEXT:    [[MUL18:%.*]] = mul nuw nsw i32 [[CONV17]], [[AND]]
; CHECK-NEXT:    [[CONV19:%.*]] = trunc i32 [[MUL18]] to i8
; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    store i8 [[CONV19]], ptr [[ARRAYIDX21]], align 1
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
;
; Scalar reference loop: loads an i16 from %p, sign-extends it to i32, runs a
; chain of shl/add/and/or/mul/and/xor/mul against the zero-extended i8 args
; %arg1/%arg2, and stores the result truncated to i8 into %q. Only the low 8
; bits of the chain reach the store, which is what lets the vectorizer perform
; the whole computation in <16 x i8> / <4 x i8> (see the CHECK lines above).
entry:
  %cmp.32 = icmp sgt i32 %len, 0
  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  ; Widen the i8 arguments to i32 once, outside the loop; the vectorized code
  ; splats these and truncates the splats back to <N x i8>.
  %conv11 = zext i8 %arg2 to i32
  %conv13 = zext i8 %arg1 to i32
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i16, ptr %p, i64 %indvars.iv
  %0 = load i16, ptr %arrayidx
  %conv = sext i16 %0 to i32
  %add = shl i32 %conv, 4
  %conv2 = add nsw i32 %add, 32
  %or = and i32 %conv, 204
  %conv8 = or i32 %or, 51
  %mul = mul nuw nsw i32 %conv8, 60
  %and = and i32 %conv2, %conv13
  %mul.masked = and i32 %mul, 252
  %conv17 = xor i32 %mul.masked, %conv11
  %mul18 = mul nuw nsw i32 %conv17, %and
  ; Truncating store: the i32 arithmetic above is only observable modulo 256.
  %conv19 = trunc i32 %mul18 to i8
  %arrayidx21 = getelementptr inbounds i8, ptr %q, i64 %indvars.iv
  store i8 %conv19, ptr %arrayidx21
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
756
define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
; CHECK-LABEL: define void @add_phifail
; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[LEN]], 0
; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP4]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <16 x i32> [[TMP4]], splat (i32 2)
; CHECK-NEXT:    [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i8>
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-NEXT:    store <16 x i8> [[TMP6]], ptr [[TMP8]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 15
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup.loopexit:
; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[A_PHI:%.*]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[CONV]] = zext i8 [[TMP10]] to i32
; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i8
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    store i8 [[CONV1]], ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
;
; Scalar reference loop: q[i] = (i8)(p[i] + 2). %a_phi carries the previous
; iteration's %conv (a first-order recurrence) but has no other use in the
; function; it still exercises the vectorizer's recurrence handling, visible
; above as the vector.recur phi and the lane-15 extract in the middle block.
entry:
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; First-order recurrence: previous iteration's zero-extended load, initialized to 0.
  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, ptr %p, i64 %indvars.iv
  %0 = load i8, ptr %arrayidx
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, ptr %q, i64 %indvars.iv
  store i8 %conv1, ptr %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
836
; When we vectorize this loop, we generate correct code even when %len is an
; exact multiple of the VF: the middle block extracts the second-to-last lane
; of the recurrence and passes it to the for.cond.cleanup block, so the
; vectorized loop returns the correct value a_phi = p[len - 2].
define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
; CHECK-LABEL: define i8 @add_phifail2
; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT:    [[TMP6]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw <16 x i32> [[TMP6]], splat (i32 2)
; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i8>
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[TMP10]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    [[A_PHI_LCSSA:%.*]] = phi i32 [ [[A_PHI:%.*]], [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[A_PHI_LCSSA]] to i8
; CHECK-NEXT:    ret i8 [[RET]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[A_PHI]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[CONV]] = zext i8 [[TMP12]] to i32
; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i8
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    store i8 [[CONV1]], ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
;
; Like @add_phifail, but here the recurrence %a_phi (previous iteration's
; %conv) is live out of the loop: for.cond.cleanup truncates it to the i8
; return value. The middle block above therefore extracts both lane 15 (resume
; value for the scalar remainder) and lane 14 (the second-to-last value, used
; directly for the exit phi when the vector loop covers the whole trip count).
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ; Returns the recurrence value, i.e. the zext of p[len - 2] truncated to i8.
  %ret = trunc i32 %a_phi to i8
  ret i8 %ret

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; First-order recurrence, live out through the exit block above.
  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, ptr %p, i64 %indvars.iv
  %0 = load i8, ptr %arrayidx
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, ptr %q, i64 %indvars.iv
  store i8 %conv1, ptr %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
920