; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-threshold=-10 | FileCheck %s --check-prefix=THRESHOLD
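; The first RUN line exercises the default SLP cost model; the second lowers
; the profitability threshold (-slp-threshold=-10) so that reduction trees the
; cost model would normally reject are vectorized as well (THRESHOLD prefix).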

@n = external local_unnamed_addr global i32, align 4
@arr = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
@arr1 = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
@res = external local_unnamed_addr global float, align 4

define float @baz() {
; CHECK-LABEL: @baz(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 2.000000e+00
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT:    store float [[OP_RDX]], ptr @res, align 4
; CHECK-NEXT:    ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @baz(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
; THRESHOLD-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], i32 1
; THRESHOLD-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], splat (float 2.000000e+00)
; THRESHOLD-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
; THRESHOLD-NEXT:    store float [[OP_RDX]], ptr @res, align 4
; THRESHOLD-NEXT:    ret float [[OP_RDX]]
;
entry:
  %0 = load i32, ptr @n, align 4
  %mul = mul nsw i32 %0, 3
  %conv = sitofp i32 %mul to float
  %1 = load float, ptr @arr, align 16
  %2 = load float, ptr @arr1, align 16
  %mul4 = fmul fast float %2, %1
  %add = fadd fast float %mul4, %conv
  %3 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
  %mul4.1 = fmul fast float %4, %3
  %add.1 = fadd fast float %mul4.1, %add
  %5 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
  %6 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
  %mul4.2 = fmul fast float %6, %5
  %add.2 = fadd fast float %mul4.2, %add.1
  %7 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
  %8 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
  %mul4.3 = fmul fast float %8, %7
  %add.3 = fadd fast float %mul4.3, %add.2
  %add7 = fadd fast float %add.3, %conv
  %add19 = fadd fast float %mul4, %add7
  %add19.1 = fadd fast float %mul4.1, %add19
  %add19.2 = fadd fast float %mul4.2, %add19.1
  %add19.3 = fadd fast float %mul4.3, %add19.2
  store float %add19.3, ptr @res, align 4
  ret float %add19.3
}

define float @bazz() {
; CHECK-LABEL: @bazz(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
; CHECK-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
; CHECK-NEXT:    ret float [[OP_RDX1]]
;
; THRESHOLD-LABEL: @bazz(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
; THRESHOLD-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
;
entry:
  %0 = load i32, ptr @n, align 4
  %mul = mul nsw i32 %0, 3
  %conv = sitofp i32 %mul to float
  %1 = load float, ptr @arr, align 16
  %2 = load float, ptr @arr1, align 16
  %mul4 = fmul fast float %2, %1
  %add = fadd fast float %mul4, %conv
  %3 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
  %mul4.1 = fmul fast float %4, %3
  %add.1 = fadd fast float %mul4.1, %add
  %5 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
  %6 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
  %mul4.2 = fmul fast float %6, %5
  %add.2 = fadd fast float %mul4.2, %add.1
  %7 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
  %8 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
  %mul4.3 = fmul fast float %8, %7
  %add.3 = fadd fast float %mul4.3, %add.2
  %mul5 = shl nsw i32 %0, 2
  %conv6 = sitofp i32 %mul5 to float
  %add7 = fadd fast float %add.3, %conv6
  %9 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16
  %10 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16
  %mul18 = fmul fast float %10, %9
  %add19 = fadd fast float %mul18, %add7
  %11 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4
  %12 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4
  %mul18.1 = fmul fast float %12, %11
  %add19.1 = fadd fast float %mul18.1, %add19
  %13 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8
  %14 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8
  %mul18.2 = fmul fast float %14, %13
  %add19.2 = fadd fast float %mul18.2, %add19.1
  %15 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4
  %16 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4
  %mul18.3 = fmul fast float %16, %15
  %add19.3 = fadd fast float %mul18.3, %add19.2
  store float %add19.3, ptr @res, align 4
  ret float %add19.3
}

define float @bazzz() {
; CHECK-LABEL: @bazzz(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; CHECK-NEXT:    store float [[TMP5]], ptr @res, align 4
; CHECK-NEXT:    ret float [[TMP5]]
;
; THRESHOLD-LABEL: @bazzz(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; THRESHOLD-NEXT:    store float [[TMP5]], ptr @res, align 4
; THRESHOLD-NEXT:    ret float [[TMP5]]
;
entry:
  %0 = load i32, ptr @n, align 4
  %conv = sitofp i32 %0 to float
  %1 = load float, ptr @arr, align 16
  %2 = load float, ptr @arr1, align 16
  %mul = fmul fast float %2, %1
  %3 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
  %mul.1 = fmul fast float %4, %3
  %5 = fadd fast float %mul.1, %mul
  %6 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
  %7 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
  %mul.2 = fmul fast float %7, %6
  %8 = fadd fast float %mul.2, %5
  %9 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
  %10 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
  %mul.3 = fmul fast float %10, %9
  %11 = fadd fast float %mul.3, %8
  %12 = fmul fast float %conv, %11
  store float %12, ptr @res, align 4
  ret float %12
}

define i32 @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
; CHECK-NEXT:    store i32 [[CONV4]], ptr @n, align 4
; CHECK-NEXT:    ret i32 [[CONV4]]
;
; THRESHOLD-LABEL: @foo(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; THRESHOLD-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
; THRESHOLD-NEXT:    store i32 [[CONV4]], ptr @n, align 4
; THRESHOLD-NEXT:    ret i32 [[CONV4]]
;
entry:
  %0 = load i32, ptr @n, align 4
  %conv = sitofp i32 %0 to float
  %1 = load float, ptr @arr, align 16
  %2 = load float, ptr @arr1, align 16
  %mul = fmul fast float %2, %1
  %3 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
  %mul.1 = fmul fast float %4, %3
  %5 = fadd fast float %mul.1, %mul
  %6 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
  %7 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
  %mul.2 = fmul fast float %7, %6
  %8 = fadd fast float %mul.2, %5
  %9 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
  %10 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
  %mul.3 = fmul fast float %10, %9
  %11 = fadd fast float %mul.3, %8
  %12 = fmul fast float %conv, %11
  %conv4 = fptosi float %12 to i32
  store i32 %conv4, ptr @n, align 4
  ret i32 %conv4
}

; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
; with fastmath on the select.
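; For reference, a minimal sketch (operands %a/%b are placeholders, not part
; of this test) of the canonical form InstCombine produces when the select
; carries fast-math flags:
;   %cmp = fcmp fast ogt float %a, %b
;   %sel = select fast i1 %cmp, float %a, float %b
; folds to:
;   %sel = call fast float @llvm.maxnum.f32(float %a, float %b)
; which the vectorizer could then match as an fmax reduction.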
define float @bar() {
; CHECK-LABEL: @bar(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr @arr1, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; CHECK-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
; CHECK-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
; CHECK-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
; CHECK-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
; CHECK-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
; CHECK-NEXT:    store float [[MAX_0_MUL3_2]], ptr @res, align 4
; CHECK-NEXT:    ret float [[MAX_0_MUL3_2]]
;
; THRESHOLD-LABEL: @bar(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr @arr, align 16
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr @arr1, align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; THRESHOLD-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
; THRESHOLD-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
; THRESHOLD-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
; THRESHOLD-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
; THRESHOLD-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
; THRESHOLD-NEXT:    [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
; THRESHOLD-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
; THRESHOLD-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
; THRESHOLD-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
; THRESHOLD-NEXT:    store float [[MAX_0_MUL3_2]], ptr @res, align 4
; THRESHOLD-NEXT:    ret float [[MAX_0_MUL3_2]]
;
entry:
  %0 = load float, ptr @arr, align 16
  %1 = load float, ptr @arr1, align 16
  %mul = fmul fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
  %3 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
  %mul3 = fmul fast float %3, %2
  %cmp4 = fcmp fast ogt float %mul, %mul3
  %max.0.mul3 = select i1 %cmp4, float %mul, float %mul3
  %4 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
  %5 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
  %mul3.1 = fmul fast float %5, %4
  %cmp4.1 = fcmp fast ogt float %max.0.mul3, %mul3.1
  %max.0.mul3.1 = select i1 %cmp4.1, float %max.0.mul3, float %mul3.1
  %6 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
  %7 = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
  %mul3.2 = fmul fast float %7, %6
  %cmp4.2 = fcmp fast ogt float %max.0.mul3.1, %mul3.2
  %max.0.mul3.2 = select i1 %cmp4.2, float %max.0.mul3.1, float %mul3.2
  store float %max.0.mul3.2, ptr @res, align 4
  ret float %max.0.mul3.2
}

define float @f(ptr nocapture readonly %x) {
; CHECK-LABEL: @f(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
; CHECK-NEXT:    ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
; THRESHOLD-NEXT:    ret float [[OP_RDX]]
;
  entry:
  %0 = load float, ptr %x, align 4
  %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1
  %1 = load float, ptr %arrayidx.1, align 4
  %add.1 = fadd fast float %1, %0
  %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2
  %2 = load float, ptr %arrayidx.2, align 4
  %add.2 = fadd fast float %2, %add.1
  %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3
  %3 = load float, ptr %arrayidx.3, align 4
  %add.3 = fadd fast float %3, %add.2
  %arrayidx.4 = getelementptr inbounds float, ptr %x, i64 4
  %4 = load float, ptr %arrayidx.4, align 4
  %add.4 = fadd fast float %4, %add.3
  %arrayidx.5 = getelementptr inbounds float, ptr %x, i64 5
  %5 = load float, ptr %arrayidx.5, align 4
  %add.5 = fadd fast float %5, %add.4
  %arrayidx.6 = getelementptr inbounds float, ptr %x, i64 6
  %6 = load float, ptr %arrayidx.6, align 4
  %add.6 = fadd fast float %6, %add.5
  %arrayidx.7 = getelementptr inbounds float, ptr %x, i64 7
  %7 = load float, ptr %arrayidx.7, align 4
  %add.7 = fadd fast float %7, %add.6
  %arrayidx.8 = getelementptr inbounds float, ptr %x, i64 8
  %8 = load float, ptr %arrayidx.8, align 4
  %add.8 = fadd fast float %8, %add.7
  %arrayidx.9 = getelementptr inbounds float, ptr %x, i64 9
  %9 = load float, ptr %arrayidx.9, align 4
  %add.9 = fadd fast float %9, %add.8
  %arrayidx.10 = getelementptr inbounds float, ptr %x, i64 10
  %10 = load float, ptr %arrayidx.10, align 4
  %add.10 = fadd fast float %10, %add.9
  %arrayidx.11 = getelementptr inbounds float, ptr %x, i64 11
  %11 = load float, ptr %arrayidx.11, align 4
  %add.11 = fadd fast float %11, %add.10
  %arrayidx.12 = getelementptr inbounds float, ptr %x, i64 12
  %12 = load float, ptr %arrayidx.12, align 4
  %add.12 = fadd fast float %12, %add.11
  %arrayidx.13 = getelementptr inbounds float, ptr %x, i64 13
  %13 = load float, ptr %arrayidx.13, align 4
  %add.13 = fadd fast float %13, %add.12
  %arrayidx.14 = getelementptr inbounds float, ptr %x, i64 14
  %14 = load float, ptr %arrayidx.14, align 4
  %add.14 = fadd fast float %14, %add.13
  %arrayidx.15 = getelementptr inbounds float, ptr %x, i64 15
  %15 = load float, ptr %arrayidx.15, align 4
  %add.15 = fadd fast float %15, %add.14
  %arrayidx.16 = getelementptr inbounds float, ptr %x, i64 16
  %16 = load float, ptr %arrayidx.16, align 4
  %add.16 = fadd fast float %16, %add.15
  %arrayidx.17 = getelementptr inbounds float, ptr %x, i64 17
  %17 = load float, ptr %arrayidx.17, align 4
  %add.17 = fadd fast float %17, %add.16
  %arrayidx.18 = getelementptr inbounds float, ptr %x, i64 18
  %18 = load float, ptr %arrayidx.18, align 4
  %add.18 = fadd fast float %18, %add.17
  %arrayidx.19 = getelementptr inbounds float, ptr %x, i64 19
  %19 = load float, ptr %arrayidx.19, align 4
  %add.19 = fadd fast float %19, %add.18
  %arrayidx.20 = getelementptr inbounds float, ptr %x, i64 20
  %20 = load float, ptr %arrayidx.20, align 4
  %add.20 = fadd fast float %20, %add.19
  %arrayidx.21 = getelementptr inbounds float, ptr %x, i64 21
  %21 = load float, ptr %arrayidx.21, align 4
  %add.21 = fadd fast float %21, %add.20
  %arrayidx.22 = getelementptr inbounds float, ptr %x, i64 22
  %22 = load float, ptr %arrayidx.22, align 4
  %add.22 = fadd fast float %22, %add.21
  %arrayidx.23 = getelementptr inbounds float, ptr %x, i64 23
  %23 = load float, ptr %arrayidx.23, align 4
  %add.23 = fadd fast float %23, %add.22
  %arrayidx.24 = getelementptr inbounds float, ptr %x, i64 24
  %24 = load float, ptr %arrayidx.24, align 4
  %add.24 = fadd fast float %24, %add.23
  %arrayidx.25 = getelementptr inbounds float, ptr %x, i64 25
  %25 = load float, ptr %arrayidx.25, align 4
  %add.25 = fadd fast float %25, %add.24
  %arrayidx.26 = getelementptr inbounds float, ptr %x, i64 26
  %26 = load float, ptr %arrayidx.26, align 4
  %add.26 = fadd fast float %26, %add.25
  %arrayidx.27 = getelementptr inbounds float, ptr %x, i64 27
  %27 = load float, ptr %arrayidx.27, align 4
  %add.27 = fadd fast float %27, %add.26
  %arrayidx.28 = getelementptr inbounds float, ptr %x, i64 28
  %28 = load float, ptr %arrayidx.28, align 4
  %add.28 = fadd fast float %28, %add.27
  %arrayidx.29 = getelementptr inbounds float, ptr %x, i64 29
  %29 = load float, ptr %arrayidx.29, align 4
  %add.29 = fadd fast float %29, %add.28
  %arrayidx.30 = getelementptr inbounds float, ptr %x, i64 30
  %30 = load float, ptr %arrayidx.30, align 4
  %add.30 = fadd fast float %30, %add.29
  %arrayidx.31 = getelementptr inbounds float, ptr %x, i64 31
  %31 = load float, ptr %arrayidx.31, align 4
  %add.31 = fadd fast float %31, %add.30
  %arrayidx.32 = getelementptr inbounds float, ptr %x, i64 32
  %32 = load float, ptr %arrayidx.32, align 4
  %add.32 = fadd fast float %32, %add.31
  %arrayidx.33 = getelementptr inbounds float, ptr %x, i64 33
  %33 = load float, ptr %arrayidx.33, align 4
  %add.33 = fadd fast float %33, %add.32
  %arrayidx.34 = getelementptr inbounds float, ptr %x, i64 34
  %34 = load float, ptr %arrayidx.34, align 4
  %add.34 = fadd fast float %34, %add.33
  %arrayidx.35 = getelementptr inbounds float, ptr %x, i64 35
  %35 = load float, ptr %arrayidx.35, align 4
  %add.35 = fadd fast float %35, %add.34
  %arrayidx.36 = getelementptr inbounds float, ptr %x, i64 36
  %36 = load float, ptr %arrayidx.36, align 4
  %add.36 = fadd fast float %36, %add.35
  %arrayidx.37 = getelementptr inbounds float, ptr %x, i64 37
  %37 = load float, ptr %arrayidx.37, align 4
  %add.37 = fadd fast float %37, %add.36
  %arrayidx.38 = getelementptr inbounds float, ptr %x, i64 38
  %38 = load float, ptr %arrayidx.38, align 4
  %add.38 = fadd fast float %38, %add.37
  %arrayidx.39 = getelementptr inbounds float, ptr %x, i64 39
  %39 = load float, ptr %arrayidx.39, align 4
  %add.39 = fadd fast float %39, %add.38
  %arrayidx.40 = getelementptr inbounds float, ptr %x, i64 40
  %40 = load float, ptr %arrayidx.40, align 4
  %add.40 = fadd fast float %40, %add.39
  %arrayidx.41 = getelementptr inbounds float, ptr %x, i64 41
  %41 = load float, ptr %arrayidx.41, align 4
  %add.41 = fadd fast float %41, %add.40
  %arrayidx.42 = getelementptr inbounds float, ptr %x, i64 42
  %42 = load float, ptr %arrayidx.42, align 4
  %add.42 = fadd fast float %42, %add.41
  %arrayidx.43 = getelementptr inbounds float, ptr %x, i64 43
  %43 = load float, ptr %arrayidx.43, align 4
  %add.43 = fadd fast float %43, %add.42
  %arrayidx.44 = getelementptr inbounds float, ptr %x, i64 44
  %44 = load float, ptr %arrayidx.44, align 4
  %add.44 = fadd fast float %44, %add.43
  %arrayidx.45 = getelementptr inbounds float, ptr %x, i64 45
  %45 = load float, ptr %arrayidx.45, align 4
  %add.45 = fadd fast float %45, %add.44
  %arrayidx.46 = getelementptr inbounds float, ptr %x, i64 46
  %46 = load float, ptr %arrayidx.46, align 4
  %add.46 = fadd fast float %46, %add.45
  %arrayidx.47 = getelementptr inbounds float, ptr %x, i64 47
  %47 = load float, ptr %arrayidx.47, align 4
  %add.47 = fadd fast float %47, %add.46
  ret float %add.47
}

define float @f1(ptr nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-LABEL: @f1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[CONV]]
; CHECK-NEXT:    ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f1(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[CONV]]
; THRESHOLD-NEXT:    ret float [[OP_RDX]]
;
  entry:
  %rem = srem i32 %a, %b
  %conv = sitofp i32 %rem to float
  %0 = load float, ptr %x, align 4
  %add = fadd fast float %0, %conv
  %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1
  %1 = load float, ptr %arrayidx.1, align 4
  %add.1 = fadd fast float %1, %add
  %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2
  %2 = load float, ptr %arrayidx.2, align 4
  %add.2 = fadd fast float %2, %add.1
  %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3
  %3 = load float, ptr %arrayidx.3, align 4
  %add.3 = fadd fast float %3, %add.2
  %arrayidx.4 = getelementptr inbounds float, ptr %x, i64 4
  %4 = load float, ptr %arrayidx.4, align 4
  %add.4 = fadd fast float %4, %add.3
  %arrayidx.5 = getelementptr inbounds float, ptr %x, i64 5
  %5 = load float, ptr %arrayidx.5, align 4
  %add.5 = fadd fast float %5, %add.4
  %arrayidx.6 = getelementptr inbounds float, ptr %x, i64 6
  %6 = load float, ptr %arrayidx.6, align 4
  %add.6 = fadd fast float %6, %add.5
  %arrayidx.7 = getelementptr inbounds float, ptr %x, i64 7
  %7 = load float, ptr %arrayidx.7, align 4
  %add.7 = fadd fast float %7, %add.6
  %arrayidx.8 = getelementptr inbounds float, ptr %x, i64 8
  %8 = load float, ptr %arrayidx.8, align 4
  %add.8 = fadd fast float %8, %add.7
  %arrayidx.9 = getelementptr inbounds float, ptr %x, i64 9
  %9 = load float, ptr %arrayidx.9, align 4
  %add.9 = fadd fast float %9, %add.8
  %arrayidx.10 = getelementptr inbounds float, ptr %x, i64 10
  %10 = load float, ptr %arrayidx.10, align 4
  %add.10 = fadd fast float %10, %add.9
  %arrayidx.11 = getelementptr inbounds float, ptr %x, i64 11
  %11 = load float, ptr %arrayidx.11, align 4
  %add.11 = fadd fast float %11, %add.10
  %arrayidx.12 = getelementptr inbounds float, ptr %x, i64 12
  %12 = load float, ptr %arrayidx.12, align 4
  %add.12 = fadd fast float %12, %add.11
  %arrayidx.13 = getelementptr inbounds float, ptr %x, i64 13
  %13 = load float, ptr %arrayidx.13, align 4
  %add.13 = fadd fast float %13, %add.12
  %arrayidx.14 = getelementptr inbounds float, ptr %x, i64 14
  %14 = load float, ptr %arrayidx.14, align 4
  %add.14 = fadd fast float %14, %add.13
  %arrayidx.15 = getelementptr inbounds float, ptr %x, i64 15
  %15 = load float, ptr %arrayidx.15, align 4
  %add.15 = fadd fast float %15, %add.14
  %arrayidx.16 = getelementptr inbounds float, ptr %x, i64 16
  %16 = load float, ptr %arrayidx.16, align 4
  %add.16 = fadd fast float %16, %add.15
  %arrayidx.17 = getelementptr inbounds float, ptr %x, i64 17
  %17 = load float, ptr %arrayidx.17, align 4
  %add.17 = fadd fast float %17, %add.16
  %arrayidx.18 = getelementptr inbounds float, ptr %x, i64 18
  %18 = load float, ptr %arrayidx.18, align 4
  %add.18 = fadd fast float %18, %add.17
  %arrayidx.19 = getelementptr inbounds float, ptr %x, i64 19
  %19 = load float, ptr %arrayidx.19, align 4
  %add.19 = fadd fast float %19, %add.18
  %arrayidx.20 = getelementptr inbounds float, ptr %x, i64 20
  %20 = load float, ptr %arrayidx.20, align 4
  %add.20 = fadd fast float %20, %add.19
  %arrayidx.21 = getelementptr inbounds float, ptr %x, i64 21
  %21 = load float, ptr %arrayidx.21, align 4
  %add.21 = fadd fast float %21, %add.20
  %arrayidx.22 = getelementptr inbounds float, ptr %x, i64 22
  %22 = load float, ptr %arrayidx.22, align 4
  %add.22 = fadd fast float %22, %add.21
  %arrayidx.23 = getelementptr inbounds float, ptr %x, i64 23
  %23 = load float, ptr %arrayidx.23, align 4
  %add.23 = fadd fast float %23, %add.22
  %arrayidx.24 = getelementptr inbounds float, ptr %x, i64 24
  %24 = load float, ptr %arrayidx.24, align 4
  %add.24 = fadd fast float %24, %add.23
  %arrayidx.25 = getelementptr inbounds float, ptr %x, i64 25
  %25 = load float, ptr %arrayidx.25, align 4
  %add.25 = fadd fast float %25, %add.24
  %arrayidx.26 = getelementptr inbounds float, ptr %x, i64 26
  %26 = load float, ptr %arrayidx.26, align 4
  %add.26 = fadd fast float %26, %add.25
  %arrayidx.27 = getelementptr inbounds float, ptr %x, i64 27
  %27 = load float, ptr %arrayidx.27, align 4
  %add.27 = fadd fast float %27, %add.26
  %arrayidx.28 = getelementptr inbounds float, ptr %x, i64 28
  %28 = load float, ptr %arrayidx.28, align 4
  %add.28 = fadd fast float %28, %add.27
  %arrayidx.29 = getelementptr inbounds float, ptr %x, i64 29
  %29 = load float, ptr %arrayidx.29, align 4
  %add.29 = fadd fast float %29, %add.28
  %arrayidx.30 = getelementptr inbounds float, ptr %x, i64 30
  %30 = load float, ptr %arrayidx.30, align 4
  %add.30 = fadd fast float %30, %add.29
  %arrayidx.31 = getelementptr inbounds float, ptr %x, i64 31
  %31 = load float, ptr %arrayidx.31, align 4
  %add.31 = fadd fast float %31, %add.30
  ret float %add.31
}

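; The 30-element chain below does not fit a single power-of-two vector, so
; the CHECK lines expect a piecewise reduction: a <24 x float> part, a
; <4 x float> part, and two scalar tail elements added back in.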
define float @loadadd31(ptr nocapture readonly %x) {
; CHECK-LABEL: @loadadd31(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
; CHECK-NEXT:    [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
; CHECK-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
; CHECK-NEXT:    ret float [[OP_RDX3]]
;
; THRESHOLD-LABEL: @loadadd31(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
; THRESHOLD-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
;
  entry:
  %arrayidx = getelementptr inbounds float, ptr %x, i64 1
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 2
  %1 = load float, ptr %arrayidx.1, align 4
  %add.1 = fadd fast float %1, %0
  %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 3
  %2 = load float, ptr %arrayidx.2, align 4
  %add.2 = fadd fast float %2, %add.1
  %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 4
  %3 = load float, ptr %arrayidx.3, align 4
  %add.3 = fadd fast float %3, %add.2
  %arrayidx.4 = getelementptr inbounds float, ptr %x, i64 5
  %4 = load float, ptr %arrayidx.4, align 4
  %add.4 = fadd fast float %4, %add.3
  %arrayidx.5 = getelementptr inbounds float, ptr %x, i64 6
  %5 = load float, ptr %arrayidx.5, align 4
  %add.5 = fadd fast float %5, %add.4
  %arrayidx.6 = getelementptr inbounds float, ptr %x, i64 7
  %6 = load float, ptr %arrayidx.6, align 4
  %add.6 = fadd fast float %6, %add.5
  %arrayidx.7 = getelementptr inbounds float, ptr %x, i64 8
  %7 = load float, ptr %arrayidx.7, align 4
  %add.7 = fadd fast float %7, %add.6
  %arrayidx.8 = getelementptr inbounds float, ptr %x, i64 9
  %8 = load float, ptr %arrayidx.8, align 4
  %add.8 = fadd fast float %8, %add.7
  %arrayidx.9 = getelementptr inbounds float, ptr %x, i64 10
  %9 = load float, ptr %arrayidx.9, align 4
  %add.9 = fadd fast float %9, %add.8
  %arrayidx.10 = getelementptr inbounds float, ptr %x, i64 11
  %10 = load float, ptr %arrayidx.10, align 4
  %add.10 = fadd fast float %10, %add.9
  %arrayidx.11 = getelementptr inbounds float, ptr %x, i64 12
  %11 = load float, ptr %arrayidx.11, align 4
  %add.11 = fadd fast float %11, %add.10
  %arrayidx.12 = getelementptr inbounds float, ptr %x, i64 13
  %12 = load float, ptr %arrayidx.12, align 4
  %add.12 = fadd fast float %12, %add.11
  %arrayidx.13 = getelementptr inbounds float, ptr %x, i64 14
  %13 = load float, ptr %arrayidx.13, align 4
  %add.13 = fadd fast float %13, %add.12
  %arrayidx.14 = getelementptr inbounds float, ptr %x, i64 15
  %14 = load float, ptr %arrayidx.14, align 4
  %add.14 = fadd fast float %14, %add.13
  %arrayidx.15 = getelementptr inbounds float, ptr %x, i64 16
  %15 = load float, ptr %arrayidx.15, align 4
  %add.15 = fadd fast float %15, %add.14
  %arrayidx.16 = getelementptr inbounds float, ptr %x, i64 17
  %16 = load float, ptr %arrayidx.16, align 4
  %add.16 = fadd fast float %16, %add.15
  %arrayidx.17 = getelementptr inbounds float, ptr %x, i64 18
  %17 = load float, ptr %arrayidx.17, align 4
  %add.17 = fadd fast float %17, %add.16
  %arrayidx.18 = getelementptr inbounds float, ptr %x, i64 19
  %18 = load float, ptr %arrayidx.18, align 4
  %add.18 = fadd fast float %18, %add.17
  %arrayidx.19 = getelementptr inbounds float, ptr %x, i64 20
  %19 = load float, ptr %arrayidx.19, align 4
  %add.19 = fadd fast float %19, %add.18
  %arrayidx.20 = getelementptr inbounds float, ptr %x, i64 21
  %20 = load float, ptr %arrayidx.20, align 4
  %add.20 = fadd fast float %20, %add.19
  %arrayidx.21 = getelementptr inbounds float, ptr %x, i64 22
  %21 = load float, ptr %arrayidx.21, align 4
  %add.21 = fadd fast float %21, %add.20
  %arrayidx.22 = getelementptr inbounds float, ptr %x, i64 23
  %22 = load float, ptr %arrayidx.22, align 4
  %add.22 = fadd fast float %22, %add.21
  %arrayidx.23 = getelementptr inbounds float, ptr %x, i64 24
  %23 = load float, ptr %arrayidx.23, align 4
  %add.23 = fadd fast float %23, %add.22
  %arrayidx.24 = getelementptr inbounds float, ptr %x, i64 25
  %24 = load float, ptr %arrayidx.24, align 4
  %add.24 = fadd fast float %24, %add.23
  %arrayidx.25 = getelementptr inbounds float, ptr %x, i64 26
  %25 = load float, ptr %arrayidx.25, align 4
  %add.25 = fadd fast float %25, %add.24
  %arrayidx.26 = getelementptr inbounds float, ptr %x, i64 27
  %26 = load float, ptr %arrayidx.26, align 4
  %add.26 = fadd fast float %26, %add.25
  %arrayidx.27 = getelementptr inbounds float, ptr %x, i64 28
  %27 = load float, ptr %arrayidx.27, align 4
  %add.27 = fadd fast float %27, %add.26
  %arrayidx.28 = getelementptr inbounds float, ptr %x, i64 29
  %28 = load float, ptr %arrayidx.28, align 4
  %add.28 = fadd fast float %28, %add.27
  %arrayidx.29 = getelementptr inbounds float, ptr %x, i64 30
  %29 = load float, ptr %arrayidx.29, align 4
  %add.29 = fadd fast float %29, %add.28
  ret float %add.29
}

define float @extra_args(ptr nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-LABEL: @extra_args(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
; CHECK-NEXT:    ret float [[OP_RDX1]]
;
; THRESHOLD-LABEL: @extra_args(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
;
  entry:
  %mul = mul nsw i32 %b, %a
  %conv = sitofp i32 %mul to float
  %0 = load float, ptr %x, align 4
  %add = fadd fast float %conv, 3.000000e+00
  %add1 = fadd fast float %0, %add
  %arrayidx3 = getelementptr inbounds float, ptr %x, i64 1
  %1 = load float, ptr %arrayidx3, align 4
  %add4 = fadd fast float %1, %add1
  %add5 = fadd fast float %add4, %conv
  %arrayidx3.1 = getelementptr inbounds float, ptr %x, i64 2
  %2 = load float, ptr %arrayidx3.1, align 4
  %add4.1 = fadd fast float %2, %add5
  %arrayidx3.2 = getelementptr inbounds float, ptr %x, i64 3
  %3 = load float, ptr %arrayidx3.2, align 4
  %add4.2 = fadd fast float %3, %add4.1
  %arrayidx3.3 = getelementptr inbounds float, ptr %x, i64 4
  %4 = load float, ptr %arrayidx3.3, align 4
  %add4.3 = fadd fast float %4, %add4.2
  %arrayidx3.4 = getelementptr inbounds float, ptr %x, i64 5
  %5 = load float, ptr %arrayidx3.4, align 4
  %add4.4 = fadd fast float %5, %add4.3
  %arrayidx3.5 = getelementptr inbounds float, ptr %x, i64 6
  %6 = load float, ptr %arrayidx3.5, align 4
  %add4.5 = fadd fast float %6, %add4.4
  %arrayidx3.6 = getelementptr inbounds float, ptr %x, i64 7
  %7 = load float, ptr %arrayidx3.6, align 4
  %add4.6 = fadd fast float %7, %add4.5
  ret float %add4.6
}

define float @extra_args_same_several_times(ptr nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-LABEL: @extra_args_same_several_times(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01
; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]]
; CHECK-NEXT:    ret float [[OP_RDX1]]
;
; THRESHOLD-LABEL: @extra_args_same_several_times(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01
; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]]
; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
;
  entry:
  %mul = mul nsw i32 %b, %a
  %conv = sitofp i32 %mul to float
  %0 = load float, ptr %x, align 4
  %add = fadd fast float %conv, 3.000000e+00
  %add1 = fadd fast float %0, %add
  %arrayidx3 = getelementptr inbounds float, ptr %x, i64 1
  %1 = load float, ptr %arrayidx3, align 4
  %add4 = fadd fast float %1, %add1
  %add41 = fadd fast float %add4, 5.000000e+00
  %add5 = fadd fast float %add41, %conv
  %arrayidx3.1 = getelementptr inbounds float, ptr %x, i64 2
  %2 = load float, ptr %arrayidx3.1, align 4
  %add4.1 = fadd fast float %2, %add5
  %add4.11 = fadd fast float %add4.1, 5.000000e+00
  %arrayidx3.2 = getelementptr inbounds float, ptr %x, i64 3
  %3 = load float, ptr %arrayidx3.2, align 4
  %add4.2 = fadd fast float %3, %add4.11
  %arrayidx3.3 = getelementptr inbounds float, ptr %x, i64 4
  %4 = load float, ptr %arrayidx3.3, align 4
  %add4.3 = fadd fast float %4, %add4.2
  %arrayidx3.4 = getelementptr inbounds float, ptr %x, i64 5
  %5 = load float, ptr %arrayidx3.4, align 4
  %add4.4 = fadd fast float %5, %add4.3
  %arrayidx3.5 = getelementptr inbounds float, ptr %x, i64 6
  %6 = load float, ptr %arrayidx3.5, align 4
  %add4.5 = fadd fast float %6, %add4.4
  %arrayidx3.6 = getelementptr inbounds float, ptr %x, i64 7
  %7 = load float, ptr %arrayidx3.6, align 4
  %add4.6 = fadd fast float %7, %add4.5
  ret float %add4.6
}

define float @extra_args_no_replace(ptr nocapture readonly %x, i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: @extra_args_no_replace(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
; CHECK-NEXT:    ret float [[OP_RDX2]]
;
; THRESHOLD-LABEL: @extra_args_no_replace(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
; THRESHOLD-NEXT:    ret float [[OP_RDX2]]
;
  entry:
  %mul = mul nsw i32 %b, %a
  %conv = sitofp i32 %mul to float
  %0 = load float, ptr %x, align 4
  %convc = sitofp i32 %c to float
  %addc = fadd fast float %convc, 3.000000e+00
  %add = fadd fast float %conv, %addc
  %add1 = fadd fast float %0, %add
  %arrayidx3 = getelementptr inbounds float, ptr %x, i64 1
  %1 = load float, ptr %arrayidx3, align 4
  %add4 = fadd fast float %1, %add1
  %arrayidx3.1 = getelementptr inbounds float, ptr %x, i64 2
  %2 = load float, ptr %arrayidx3.1, align 4
  %add4.1 = fadd fast float %2, %add4
  %arrayidx3.2 = getelementptr inbounds float, ptr %x, i64 3
  %3 = load float, ptr %arrayidx3.2, align 4
  %add4.2 = fadd fast float %3, %add4.1
  %arrayidx3.3 = getelementptr inbounds float, ptr %x, i64 4
  %4 = load float, ptr %arrayidx3.3, align 4
  %add4.3 = fadd fast float %4, %add4.2
  %add5 = fadd fast float %add4.3, %conv
  %arrayidx3.4 = getelementptr inbounds float, ptr %x, i64 5
  %5 = load float, ptr %arrayidx3.4, align 4
  %add4.4 = fadd fast float %5, %add5
  %arrayidx3.5 = getelementptr inbounds float, ptr %x, i64 6
  %6 = load float, ptr %arrayidx3.5, align 4
  %add4.5 = fadd fast float %6, %add4.4
  %arrayidx3.6 = getelementptr inbounds float, ptr %x, i64 7
  %7 = load float, ptr %arrayidx3.6, align 4
  %add4.6 = fadd fast float %7, %add4.5
  ret float %add4.6
}

define float @extra_args_no_fast(ptr %x, float %a, float %b) {
; CHECK-LABEL: @extra_args_no_fast(
; CHECK-NEXT:    [[ADDC:%.*]] = fadd fast float [[B:%.*]], 3.000000e+00
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[A:%.*]], [[ADDC]]
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3
; CHECK-NEXT:    [[T0:%.*]] = load float, ptr [[X]], align 4
; CHECK-NEXT:    [[T1:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    [[T2:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4
; CHECK-NEXT:    [[T3:%.*]] = load float, ptr [[ARRAYIDX3_2]], align 4
; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[T0]], [[ADD]]
; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[T1]], [[ADD1]]
; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd float [[T2]], [[ADD4]]
; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[T3]], [[ADD4_1]]
; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_2]], [[A]]
; CHECK-NEXT:    ret float [[ADD5]]
;
; THRESHOLD-LABEL: @extra_args_no_fast(
; THRESHOLD-NEXT:    [[ADDC:%.*]] = fadd fast float [[B:%.*]], 3.000000e+00
; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[A:%.*]], [[ADDC]]
; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3
; THRESHOLD-NEXT:    [[T0:%.*]] = load float, ptr [[X]], align 4
; THRESHOLD-NEXT:    [[T1:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; THRESHOLD-NEXT:    [[T2:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4
; THRESHOLD-NEXT:    [[T3:%.*]] = load float, ptr [[ARRAYIDX3_2]], align 4
; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float [[T0]], [[ADD]]
; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float [[T1]], [[ADD1]]
; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd float [[T2]], [[ADD4]]
; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[T3]], [[ADD4_1]]
; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_2]], [[A]]
; THRESHOLD-NEXT:    ret float [[ADD5]]
;
  %addc = fadd fast float %b, 3.0
  %add = fadd fast float %a, %addc
  %arrayidx3 = getelementptr inbounds float, ptr %x, i64 1
  %arrayidx3.1 = getelementptr inbounds float, ptr %x, i64 2
  %arrayidx3.2 = getelementptr inbounds float, ptr %x, i64 3
  %t0 = load float, ptr %x, align 4
  %t1 = load float, ptr %arrayidx3, align 4
  %t2 = load float, ptr %arrayidx3.1, align 4
  %t3 = load float, ptr %arrayidx3.2, align 4
  %add1 = fadd fast float %t0, %add
  %add4 = fadd fast float %t1, %add1
  %add4.1 = fadd float %t2, %add4  ; this is not a reduction candidate
  %add4.2 = fadd fast float %t3, %add4.1
  %add5 = fadd fast float %add4.2, %a
  ret float %add5
}

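; Integer variant: the four identical xor+icmp+sext chains are vectorized and
; reduced via llvm.vector.reduce.add, while %x4 and %arg stay scalar and are
; folded into the final reduction value.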
define i32 @wobble(i32 %arg, i32 %bar) {
; CHECK-LABEL: @wobble(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG:%.*]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP7]], [[X4]]
; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]]
; CHECK-NEXT:    ret i32 [[OP_RDX1]]
;
; THRESHOLD-LABEL: @wobble(
; THRESHOLD-NEXT:  bb:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG:%.*]], i32 0
; THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
; THRESHOLD-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; THRESHOLD-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]]
; THRESHOLD-NEXT:    [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
; THRESHOLD-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
; THRESHOLD-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
; THRESHOLD-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP7]], [[X4]]
; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]]
; THRESHOLD-NEXT:    ret i32 [[OP_RDX1]]
;
  bb:
  %x1 = xor i32 %arg, %bar
  %i1 = icmp eq i32 %x1, 0
  %s1 = sext i1 %i1 to i32
  %x2 = xor i32 %arg, %bar
  %i2 = icmp eq i32 %x2, 0
  %s2 = sext i1 %i2 to i32
  %x3 = xor i32 %arg, %bar
  %i3 = icmp eq i32 %x3, 0
  %s3 = sext i1 %i3 to i32
  %x4 = xor i32 %arg, %bar
  %i4 = icmp eq i32 %x4, 0
  %s4 = sext i1 %i4 to i32
  %r1 = add nuw i32 %arg, %s1
  %r2 = add nsw i32 %r1, %s2
  %r3 = add nsw i32 %r2, %s3
  %r4 = add nsw i32 %r3, %s4
  %r5 = add nsw i32 %r4, %x4
  ret i32 %r5
}