xref: /llvm-project/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll (revision 4ad0fdd1631eeae432714c03ede01a10dc00025d)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=4 2>&1 | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5target triple = "aarch64"
6
7define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) {
8; CHECK-LABEL: @add_i8(
9; CHECK-NEXT:  iter.check:
10; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 8
11; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
12; CHECK:       vector.main.loop.iter.check:
13; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 64
14; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
15; CHECK:       vector.ph:
16; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 64
17; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]]
18; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
19; CHECK:       vector.body:
20; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
21; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
22; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[TMP0]]
23; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
24; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
25; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32
26; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 48
27; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
28; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
29; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
30; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
31; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 [[TMP0]]
32; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
33; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
34; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 32
35; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 48
36; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
37; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
38; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
39; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
40; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD]]
41; CHECK-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
42; CHECK-NEXT:    [[TMP13:%.*]] = add <16 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
43; CHECK-NEXT:    [[TMP14:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
44; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
45; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
46; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 16
47; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 32
48; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 48
49; CHECK-NEXT:    store <16 x i8> [[TMP11]], ptr [[TMP16]], align 1
50; CHECK-NEXT:    store <16 x i8> [[TMP12]], ptr [[TMP17]], align 1
51; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[TMP18]], align 1
52; CHECK-NEXT:    store <16 x i8> [[TMP14]], ptr [[TMP19]], align 1
53; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
54; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
55; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
56; CHECK:       middle.block:
57; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]]
58; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
59; CHECK:       vec.epilog.iter.check:
60; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]]
61; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
62; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
63; CHECK:       vec.epilog.ph:
64; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
65; CHECK-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 8
66; CHECK-NEXT:    [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]]
67; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
68; CHECK:       vec.epilog.vector.body:
69; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
70; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX11]], 0
71; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP21]]
72; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
73; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP23]], align 1
74; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP21]]
75; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0
76; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP25]], align 1
77; CHECK-NEXT:    [[TMP26:%.*]] = add <8 x i8> [[WIDE_LOAD13]], [[WIDE_LOAD12]]
78; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP21]]
79; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
80; CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP28]], align 1
81; CHECK-NEXT:    [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
82; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
83; CHECK-NEXT:    br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
84; CHECK:       vec.epilog.middle.block:
85; CHECK-NEXT:    [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]]
86; CHECK-NEXT:    br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
87; CHECK:       vec.epilog.scalar.ph:
88; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
89; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
90; CHECK:       for.body:
91; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
92; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
93; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
94; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[IV]]
95; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
96; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP31]], [[TMP30]]
97; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
98; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
99; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
100; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]]
101; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
102; CHECK:       exit:
103; CHECK-NEXT:    ret void
104;
105entry:
106  br label %for.body
107
108for.body:
109  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
110  %arrayidx = getelementptr inbounds i8, ptr %B, i64 %iv
111  %0 = load i8, ptr %arrayidx, align 1
112  %arrayidx2 = getelementptr inbounds i8, ptr %C, i64 %iv
113  %1 = load i8, ptr %arrayidx2, align 1
114  %add = add i8 %1, %0
115  %arrayidx6 = getelementptr inbounds i8, ptr %A, i64 %iv
116  store i8 %add, ptr %arrayidx6, align 1
117  %iv.next = add nuw nsw i64 %iv, 1
118  %exitcond.not = icmp eq i64 %iv.next, %Iterations
119  br i1 %exitcond.not, label %exit, label %for.body
120
121exit:
122  ret void
123}
124
125define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) {
126; CHECK-LABEL: @add_i16(
127; CHECK-NEXT:  iter.check:
128; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 4
129; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
130; CHECK:       vector.main.loop.iter.check:
131; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 32
132; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
133; CHECK:       vector.ph:
134; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 32
135; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]]
136; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
137; CHECK:       vector.body:
138; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
139; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
140; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 [[TMP0]]
141; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
142; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 8
143; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 16
144; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 24
145; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 1
146; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 1
147; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 1
148; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP5]], align 1
149; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i64 [[TMP0]]
150; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
151; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 8
152; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 16
153; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 24
154; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1
155; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP8]], align 1
156; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP9]], align 1
157; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP10]], align 1
158; CHECK-NEXT:    [[TMP11:%.*]] = add <8 x i16> [[WIDE_LOAD5]], [[WIDE_LOAD]]
159; CHECK-NEXT:    [[TMP12:%.*]] = add <8 x i16> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
160; CHECK-NEXT:    [[TMP13:%.*]] = add <8 x i16> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
161; CHECK-NEXT:    [[TMP14:%.*]] = add <8 x i16> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
162; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i64 [[TMP0]]
163; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
164; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 8
165; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 16
166; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 24
167; CHECK-NEXT:    store <8 x i16> [[TMP11]], ptr [[TMP16]], align 1
168; CHECK-NEXT:    store <8 x i16> [[TMP12]], ptr [[TMP17]], align 1
169; CHECK-NEXT:    store <8 x i16> [[TMP13]], ptr [[TMP18]], align 1
170; CHECK-NEXT:    store <8 x i16> [[TMP14]], ptr [[TMP19]], align 1
171; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
172; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
173; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
174; CHECK:       middle.block:
175; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]]
176; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
177; CHECK:       vec.epilog.iter.check:
178; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]]
179; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
180; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
181; CHECK:       vec.epilog.ph:
182; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
183; CHECK-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 4
184; CHECK-NEXT:    [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]]
185; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
186; CHECK:       vec.epilog.vector.body:
187; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
188; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX11]], 0
189; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP21]]
190; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
191; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i16>, ptr [[TMP23]], align 1
192; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP21]]
193; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0
194; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i16>, ptr [[TMP25]], align 1
195; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i16> [[WIDE_LOAD13]], [[WIDE_LOAD12]]
196; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP21]]
197; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i16, ptr [[TMP27]], i32 0
198; CHECK-NEXT:    store <4 x i16> [[TMP26]], ptr [[TMP28]], align 1
199; CHECK-NEXT:    [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4
200; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
201; CHECK-NEXT:    br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
202; CHECK:       vec.epilog.middle.block:
203; CHECK-NEXT:    [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]]
204; CHECK-NEXT:    br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
205; CHECK:       vec.epilog.scalar.ph:
206; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
207; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
208; CHECK:       for.body:
209; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
210; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[IV]]
211; CHECK-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
212; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[IV]]
213; CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX2]], align 1
214; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[TMP31]], [[TMP30]]
215; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV]]
216; CHECK-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX6]], align 1
217; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
218; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]]
219; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
220; CHECK:       exit:
221; CHECK-NEXT:    ret void
222;
223entry:
224  br label %for.body
225
226for.body:
227  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
228  %arrayidx = getelementptr inbounds i16, ptr %B, i64 %iv
229  %0 = load i16, ptr %arrayidx, align 1
230  %arrayidx2 = getelementptr inbounds i16, ptr %C, i64 %iv
231  %1 = load i16, ptr %arrayidx2, align 1
232  %add = add i16 %1, %0
233  %arrayidx6 = getelementptr inbounds i16, ptr %A, i64 %iv
234  store i16 %add, ptr %arrayidx6, align 1
235  %iv.next = add nuw nsw i64 %iv, 1
236  %exitcond.not = icmp eq i64 %iv.next, %Iterations
237  br i1 %exitcond.not, label %exit, label %for.body
238
239exit:
240  ret void
241}
242
243define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) {
244; CHECK-LABEL: @add_i32(
245; CHECK-NEXT:  iter.check:
246; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 4
247; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
248; CHECK:       vector.main.loop.iter.check:
249; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 16
250; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
251; CHECK:       vector.ph:
252; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 16
253; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]]
254; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
255; CHECK:       vector.body:
256; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
257; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
258; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
259; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
260; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
261; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
262; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
263; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
264; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
265; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
266; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
267; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
268; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
269; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4
270; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
271; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 12
272; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP7]], align 1
273; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1
274; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
275; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
276; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
277; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
278; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
279; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
280; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
281; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
282; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 4
283; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 8
284; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 12
285; CHECK-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP16]], align 1
286; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP17]], align 1
287; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP18]], align 1
288; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP19]], align 1
289; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
290; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
291; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
292; CHECK:       middle.block:
293; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]]
294; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
295; CHECK:       vec.epilog.iter.check:
296; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]]
297; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
298; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
299; CHECK:       vec.epilog.ph:
300; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
301; CHECK-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 4
302; CHECK-NEXT:    [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]]
303; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
304; CHECK:       vec.epilog.vector.body:
305; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
306; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX11]], 0
307; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]]
308; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
309; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP23]], align 1
310; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP21]]
311; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
312; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP25]], align 1
313; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[WIDE_LOAD13]], [[WIDE_LOAD12]]
314; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
315; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
316; CHECK-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP28]], align 1
317; CHECK-NEXT:    [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4
318; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
319; CHECK-NEXT:    br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
320; CHECK:       vec.epilog.middle.block:
321; CHECK-NEXT:    [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]]
322; CHECK-NEXT:    br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
323; CHECK:       vec.epilog.scalar.ph:
324; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
325; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
326; CHECK:       for.body:
327; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
328; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
329; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX]], align 1
330; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
331; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX2]], align 1
332; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP31]], [[TMP30]]
333; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
334; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX6]], align 1
335; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
336; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]]
337; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
338; CHECK:       exit:
339; CHECK-NEXT:    ret void
340;
341entry:
342  br label %for.body
343
344for.body:
345  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
346  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %iv
347  %0 = load i32, ptr %arrayidx, align 1
348  %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %iv
349  %1 = load i32, ptr %arrayidx2, align 1
350  %add = add i32 %1, %0
351  %arrayidx6 = getelementptr inbounds i32, ptr %A, i64 %iv
352  store i32 %add, ptr %arrayidx6, align 1
353  %iv.next = add nuw nsw i64 %iv, 1
354  %exitcond.not = icmp eq i64 %iv.next, %Iterations
355  br i1 %exitcond.not, label %exit, label %for.body
356
357exit:
358  ret void
359}
360