xref: /llvm-project/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll (revision 29441e4f5fa5f5c7709f7cf180815ba97f611297)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
2; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s
3
; Signed-minimum reduction written as icmp slt + select.
; The scalar loop runs 256 iterations over i32 elements of %A, starting the
; accumulator at 1000 and keeping the smaller of the accumulator and the loaded
; value. %B is not referenced by the body.
; The assertions below expect the reduction to be performed inside the vector
; loop: a scalar i32 phi, a llvm.vector.reduce.smin over each <4 x i32> load,
; and a scalar llvm.smin combining it with the running value.
4define i32 @reduction_smin(ptr nocapture %A, ptr nocapture %B) {
5; CHECK-LABEL: define i32 @reduction_smin
6; CHECK-SAME: (ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
7; CHECK-NEXT:  entry:
8; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
9; CHECK:       vector.ph:
10; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
11; CHECK:       vector.body:
12; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
13; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
14; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
15; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
16; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]])
17; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP1]], i32 [[VEC_PHI]])
18; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
19; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
20; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
21; CHECK:       middle.block:
22; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
23; CHECK:       scalar.ph:
24; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
25; CHECK:       for.body:
26; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
27; CHECK:       for.end:
28; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
29; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
30;
31entry:
32  br label %for.body
33
34for.body:                                         ; preds = %entry, %for.body
35  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
36  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
37  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
38  %l0 = load i32, ptr %arrayidx, align 4
39  %c0 = icmp slt i32 %result.08, %l0
40  %v0 = select i1 %c0, i32 %result.08, i32 %l0
41  %indvars.iv.next = add i64 %indvars.iv, 1
42  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
43  %exitcond = icmp eq i32 %lftr.wideiv, 256
44  br i1 %exitcond, label %for.end, label %for.body
45
46for.end:                                          ; preds = %for.body
47  %result.0.lcssa = phi i32 [ %v0, %for.body ]
48  ret i32 %result.0.lcssa
49}
50
; Same icmp slt compare as @reduction_smin, but with the select operands
; swapped: when the accumulator is less than the loaded value the loaded value
; is chosen, so the loop keeps the larger of the two. Despite the function
; name, the assertions below therefore expect signed-max intrinsics
; (llvm.vector.reduce.smax followed by a scalar llvm.smax) inside the vector
; loop. The accumulator starts at 1000 and the scalar loop runs 256 iterations
; over %A. %B is not referenced by the body.
51define i32 @reduction_smin_select_ops_flipped(ptr nocapture %A, ptr nocapture %B) {
52; CHECK-LABEL: define i32 @reduction_smin_select_ops_flipped
53; CHECK-SAME: (ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
54; CHECK-NEXT:  entry:
55; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
56; CHECK:       vector.ph:
57; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
58; CHECK:       vector.body:
59; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
60; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
61; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
62; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
63; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[WIDE_LOAD]])
64; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 [[VEC_PHI]])
65; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
66; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
67; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
68; CHECK:       middle.block:
69; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
70; CHECK:       scalar.ph:
71; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
72; CHECK:       for.body:
73; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
74; CHECK:       for.end:
75; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
76; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
77;
78entry:
79  br label %for.body
80
81for.body:                                         ; preds = %entry, %for.body
82  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
83  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
84  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
85  %l0 = load i32, ptr %arrayidx, align 4
86  %c0 = icmp slt i32 %result.08, %l0
87  %v0 = select i1 %c0, i32 %l0, i32 %result.08
88  %indvars.iv.next = add i64 %indvars.iv, 1
89  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
90  %exitcond = icmp eq i32 %lftr.wideiv, 256
91  br i1 %exitcond, label %for.end, label %for.body
92
93for.end:                                          ; preds = %for.body
94  %result.0.lcssa = phi i32 [ %v0, %for.body ]
95  ret i32 %result.0.lcssa
96}
97
; Signed-minimum reduction written with a direct call to @llvm.smin.i32
; instead of icmp + select. The accumulator starts at 1000 and the scalar loop
; runs 256 iterations over i32 elements of %A. %B is not referenced by the body.
; Unlike the icmp/select variants in this file, the assertions below expect a
; <4 x i32> accumulator phi (splat of 1000) updated by llvm.smin.v4i32 in the
; vector loop, with the single llvm.vector.reduce.smin placed in middle.block.
98define i32 @reduction_smin_intrinsic(ptr nocapture %A, ptr nocapture %B) {
99; CHECK-LABEL: define i32 @reduction_smin_intrinsic
100; CHECK-SAME: (ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
101; CHECK-NEXT:  entry:
102; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
103; CHECK:       vector.ph:
104; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
105; CHECK:       vector.body:
106; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
107; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1000), [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
108; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
109; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
110; CHECK-NEXT:    [[TMP1]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
111; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
112; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
113; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
114; CHECK:       middle.block:
115; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP1]])
116; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
117; CHECK:       scalar.ph:
118; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
119; CHECK:       for.body:
120; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
121; CHECK:       for.end:
122; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
123; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
124;
125entry:
126  br label %for.body
127
128for.body:                                         ; preds = %entry, %for.body
129  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
130  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
131  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
132  %l0 = load i32, ptr %arrayidx, align 4
133  %v0 = call i32 @llvm.smin.i32(i32 %result.08, i32 %l0)
134  %indvars.iv.next = add i64 %indvars.iv, 1
135  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
136  %exitcond = icmp eq i32 %lftr.wideiv, 256
137  br i1 %exitcond, label %for.end, label %for.body
138
139for.end:                                          ; preds = %for.body
140  %result.0.lcssa = phi i32 [ %v0, %for.body ]
141  ret i32 %result.0.lcssa
142}
143
; Scalar signed-min intrinsic called by @reduction_smin_intrinsic.
144declare i32 @llvm.smin.i32(i32, i32)
145
; Unsigned-maximum reduction written as icmp ugt + select.
; The scalar loop runs 256 iterations over i32 elements of %A, starting the
; accumulator at 1000 and keeping the larger (unsigned) of the accumulator and
; the loaded value. %B is not referenced by the body.
; The assertions below expect the reduction to be performed inside the vector
; loop: a scalar i32 phi, a llvm.vector.reduce.umax over each <4 x i32> load,
; and a scalar llvm.umax combining it with the running value.
146define i32 @reduction_umax(ptr nocapture %A, ptr nocapture %B) {
147; CHECK-LABEL: define i32 @reduction_umax
148; CHECK-SAME: (ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
149; CHECK-NEXT:  entry:
150; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
151; CHECK:       vector.ph:
152; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
153; CHECK:       vector.body:
154; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
155; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
156; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
157; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
158; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]])
159; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 [[VEC_PHI]])
160; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
161; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
162; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
163; CHECK:       middle.block:
164; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
165; CHECK:       scalar.ph:
166; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
167; CHECK:       for.body:
168; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
169; CHECK:       for.end:
170; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
171; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
172;
173entry:
174  br label %for.body
175
176for.body:                                         ; preds = %entry, %for.body
177  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
178  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
179  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
180  %l0 = load i32, ptr %arrayidx, align 4
181  %c0 = icmp ugt i32 %result.08, %l0
182  %v0 = select i1 %c0, i32 %result.08, i32 %l0
183  %indvars.iv.next = add i64 %indvars.iv, 1
184  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
185  %exitcond = icmp eq i32 %lftr.wideiv, 256
186  br i1 %exitcond, label %for.end, label %for.body
187
188for.end:                                          ; preds = %for.body
189  %result.0.lcssa = phi i32 [ %v0, %for.body ]
190  ret i32 %result.0.lcssa
191}
192
; Same icmp ugt compare as @reduction_umax, but with the select operands
; swapped: when the accumulator is greater than the loaded value the loaded
; value is chosen, so the loop keeps the smaller (unsigned) of the two. Despite
; the function name, the assertions below therefore expect unsigned-min
; intrinsics (llvm.vector.reduce.umin followed by a scalar llvm.umin) inside
; the vector loop. The accumulator starts at 1000 and the scalar loop runs 256
; iterations over %A. %B is not referenced by the body.
193define i32 @reduction_umax_select_ops_flipped(ptr nocapture %A, ptr nocapture %B) {
194; CHECK-LABEL: define i32 @reduction_umax_select_ops_flipped
195; CHECK-SAME: (ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
196; CHECK-NEXT:  entry:
197; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
198; CHECK:       vector.ph:
199; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
200; CHECK:       vector.body:
201; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
202; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
203; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
204; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
205; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[WIDE_LOAD]])
206; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 [[VEC_PHI]])
207; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
208; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
209; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
210; CHECK:       middle.block:
211; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
212; CHECK:       scalar.ph:
213; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
214; CHECK:       for.body:
215; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
216; CHECK:       for.end:
217; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
218; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
219;
220entry:
221  br label %for.body
222
223for.body:                                         ; preds = %entry, %for.body
224  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
225  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
226  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
227  %l0 = load i32, ptr %arrayidx, align 4
228  %c0 = icmp ugt i32 %result.08, %l0
229  %v0 = select i1 %c0, i32 %l0, i32 %result.08
230  %indvars.iv.next = add i64 %indvars.iv, 1
231  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
232  %exitcond = icmp eq i32 %lftr.wideiv, 256
233  br i1 %exitcond, label %for.end, label %for.body
234
235for.end:                                          ; preds = %for.body
236  %result.0.lcssa = phi i32 [ %v0, %for.body ]
237  ret i32 %result.0.lcssa
238}
239
; Unsigned-maximum reduction written with a direct call to @llvm.umax.i32
; instead of icmp + select. The accumulator starts at 1000 and the scalar loop
; runs 256 iterations over i32 elements of %A. %B is not referenced by the body.
; Unlike the icmp/select variants in this file, the assertions below expect a
; <4 x i32> accumulator phi (splat of 1000) updated by llvm.umax.v4i32 in the
; vector loop, with the single llvm.vector.reduce.umax placed in middle.block.
240define i32 @reduction_umax_intrinsic(ptr nocapture %A, ptr nocapture %B) {
241; CHECK-LABEL: define i32 @reduction_umax_intrinsic
242; CHECK-SAME: (ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
243; CHECK-NEXT:  entry:
244; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
245; CHECK:       vector.ph:
246; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
247; CHECK:       vector.body:
248; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
249; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1000), [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
250; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
251; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
252; CHECK-NEXT:    [[TMP1]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
253; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
254; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
255; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
256; CHECK:       middle.block:
257; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP1]])
258; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
259; CHECK:       scalar.ph:
260; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
261; CHECK:       for.body:
262; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
263; CHECK:       for.end:
264; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
265; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
266;
267entry:
268  br label %for.body
269
270for.body:                                         ; preds = %entry, %for.body
271  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
272  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
273  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
274  %l0 = load i32, ptr %arrayidx, align 4
275  %v0 = call i32 @llvm.umax.i32(i32 %result.08, i32 %l0)
276  %indvars.iv.next = add i64 %indvars.iv, 1
277  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
278  %exitcond = icmp eq i32 %lftr.wideiv, 256
279  br i1 %exitcond, label %for.end, label %for.body
280
281for.end:                                          ; preds = %for.body
282  %result.0.lcssa = phi i32 [ %v0, %for.body ]
283  ret i32 %result.0.lcssa
284}
285
; Scalar unsigned-max intrinsic called by @reduction_umax_intrinsic.
286declare i32 @llvm.umax.i32(i32, i32)
287