; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
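; Both RUN lines are checked against the same CHECK prefix; the second run
; additionally passes -slp-vectorize-hor and -slp-vectorize-hor-store so that
; horizontal reductions (including ones feeding stores) are considered, and it
; is expected to produce the same output.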

; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4  ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }

define i32 @add_red(ptr %A, i32 %n) {
; CHECK-LABEL: @add_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], splat (float 7.000000e+00)
; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP3]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, ptr %A, i64 %mul
  %1 = load float, ptr %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or disjoint i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, ptr %A, i64 %add28
  %2 = load float, ptr %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or disjoint i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, ptr %A, i64 %add829
  %3 = load float, ptr %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or disjoint i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, ptr %A, i64 %add1330
  %4 = load float, ptr %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;       B[1]*A[i*4+1] +
;       B[2]*A[i*4+2] +
;       B[3]*A[i*4+3];
;   }
;   return sum;
; }

define i32 @mul_red(ptr noalias %A, ptr noalias %B, i32 %n) {
; CHECK-LABEL: @mul_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP4]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, ptr %B, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %1 = load float, ptr %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
  %2 = load float, ptr %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
  %3 = load float, ptr %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %5 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add35
  %6 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or disjoint i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add1136
  %7 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or disjoint i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add1737
  %8 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }

define i32 @long_red(ptr noalias %A, ptr noalias %B, i32 %n) {
; CHECK-LABEL: @long_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[B]], i64 8
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX45]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <8 x float> [[TMP0]], [[TMP3]]
; CHECK-NEXT:    [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[ADD47]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP4]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP6]], [[MUL49]]
; CHECK-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, ptr %B, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %1 = load float, ptr %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
  %2 = load float, ptr %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
  %3 = load float, ptr %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, ptr %B, i64 4
  %4 = load float, ptr %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, ptr %B, i64 5
  %5 = load float, ptr %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, ptr %B, i64 6
  %6 = load float, ptr %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, ptr %B, i64 7
  %7 = load float, ptr %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, ptr %B, i64 8
  %8 = load float, ptr %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %10 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add80
  %11 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add11
  %12 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add17
  %13 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, ptr %A, i64 %add23
  %14 = load float, ptr %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, ptr %A, i64 %add29
  %15 = load float, ptr %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, ptr %A, i64 %add35
  %16 = load float, ptr %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, ptr %A, i64 %add41
  %17 = load float, ptr %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, ptr %A, i64 %add47
  %18 = load float, ptr %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }

define i32 @chain_red(ptr noalias %A, ptr noalias %B, i32 %n) {
; CHECK-LABEL: @chain_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[OP_RDX]] = fadd fast float [[TMP4]], [[SUM_042]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, ptr %B, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %1 = load float, ptr %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, ptr %B, i64 2
  %2 = load float, ptr %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, ptr %B, i64 3
  %3 = load float, ptr %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %5 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  %add638 = or disjoint i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, ptr %A, i64 %add638
  %6 = load float, ptr %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or disjoint i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, ptr %A, i64 %add1239
  %7 = load float, ptr %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or disjoint i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, ptr %A, i64 %add1840
  %8 = load float, ptr %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; void foo(const float *arg_A, unsigned arg_B, float *array) {
;   for (uint32_t i = 0; i < 6; ++i) {
;     const float *ptr = arg_A + i;
;     float w0 = array[i * 4 + 0];
;     float w1 = array[i * 4 + 1];
;     float w2 = array[i * 4 + 2];
;     float w3 = array[i * 4 + 3];
;
;     for (unsigned j = 0; j < arg_B; ++j) {
;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
;       const float x4 = x3 + (-4.0f * w2) + w3;
;       w1 = w0;
;       w0 = x1;
;       w3 = w2;
;       w2 = x3;
;     }
;
;     array[i * 4 + 0] = w0;
;     array[i * 4 + 1] = w1;
;     array[i * 4 + 2] = w2;
;     array[i * 4 + 3] = w3;
;   }
; }

define void @foo(ptr nocapture readonly %arg_A, i32 %arg_B, ptr nocapture %array) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[ARRAY:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[TMP0]], 1
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP0]], 2
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 3
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; CHECK:       for.body16.lr.ph:
; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ADD_PTR]], align 4
; CHECK-NEXT:    br label [[FOR_BODY16:%.*]]
; CHECK:       for.cond.cleanup15:
; CHECK-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    store float [[W0_0_LCSSA]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    store float [[W1_0_LCSSA]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    store float [[W2_0_LCSSA]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    store float [[W3_0_LCSSA]], ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; CHECK-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; CHECK:       for.body16:
; CHECK-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; CHECK-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; CHECK-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; CHECK-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; CHECK-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; CHECK-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; CHECK-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; CHECK-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; CHECK-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; CHECK-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; CHECK-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
  ret void

for.body:                                         ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, ptr %array, i64 %0
  %1 = load float, ptr %arrayidx, align 4
  %2 = or disjoint i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, ptr %array, i64 %2
  %3 = load float, ptr %arrayidx4, align 4
  %4 = or disjoint i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, ptr %array, i64 %4
  %5 = load float, ptr %arrayidx8, align 4
  %6 = or disjoint i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, ptr %array, i64 %6
  %7 = load float, ptr %arrayidx12, align 4
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph:                                 ; preds = %for.body
  %add.ptr = getelementptr inbounds float, ptr %arg_A, i64 %indvars.iv
  %8 = load float, ptr %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15:                               ; preds = %for.body16, %for.body
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, ptr %arrayidx, align 4
  store float %w1.0.lcssa, ptr %arrayidx4, align 4
  store float %w2.0.lcssa, ptr %arrayidx8, align 4
  store float %w3.0.lcssa, ptr %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}


; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }

define void @store_red_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, i32 %n) {
; CHECK-LABEL: @store_red_double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[C:%.*]], i64 [[I_018]]
; CHECK-NEXT:    store double [[ADD8]], ptr [[ARRAYIDX9]], align 8
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, ptr %B, align 8
  %arrayidx4 = getelementptr inbounds double, ptr %B, i64 1
  %1 = load double, ptr %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %mul
  %3 = load double, ptr %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, ptr %A, i64 %add16
  %4 = load double, ptr %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, ptr %C, i64 %i.018
  store double %add8, ptr %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;          B[1] *A[i*4+1] +
;          B[2] *A[i*4+2] +
;          B[3] *A[i*4+3];
;   }
;   return sum;
; }

define i32 @store_red(ptr noalias %A, ptr noalias %B, ptr noalias %C, i32 %n) {
; CHECK-LABEL: @store_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[C_ADDR_038:%.*]] = phi ptr [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    store float [[TMP4]], ptr [[C_ADDR_038]], align 4
; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[C_ADDR_038]], i64 1
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
  %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi ptr [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, ptr %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %2 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, ptr %arrayidx4, align 4
  %add34 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add34
  %4 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, ptr %arrayidx9, align 4
  %add1135 = or disjoint i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add1135
  %6 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, ptr %arrayidx15, align 4
  %add1736 = or disjoint i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add1736
  %8 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  store float %add20, ptr %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, ptr %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}

@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16

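; The reduction tests below operate on the global arrays above. A rough C
; equivalent of the float variants, following the style of the earlier
; comments (a sketch assuming -ffast-math; widths 8 and 16 follow the same
; pattern; not part of the original source):
;
; void float_red_example4(float *res) {
;   float sum = arr_float[0];
;   for (int i = 1; i < 4; ++i)
;     sum += arr_float[i];
;   *res = sum;
; }
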
define void @float_red_example4(ptr %res) {
; CHECK-LABEL: @float_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @arr_float, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
; CHECK-NEXT:    store float [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @arr_float, align 16
  %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  store float %add.2, ptr %res, align 16
  ret void
}

define void @float_red_example8(ptr %res) {
; CHECK-LABEL: @float_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr @arr_float, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; CHECK-NEXT:    store float [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @arr_float, align 16
  %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  store float %add.6, ptr %res, align 16
  ret void
}

define void @float_red_example16(ptr %res) {
; CHECK-LABEL: @float_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr @arr_float, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
; CHECK-NEXT:    store float [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @arr_float, align 16
  %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  %8 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 8), align 16
  %add.7 = fadd fast float %8, %add.6
  %9 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 9), align 4
  %add.8 = fadd fast float %9, %add.7
  %10 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 10), align 8
  %add.9 = fadd fast float %10, %add.8
  %11 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 11), align 4
  %add.10 = fadd fast float %11, %add.9
  %12 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 12), align 16
  %add.11 = fadd fast float %12, %add.10
  %13 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 13), align 4
  %add.12 = fadd fast float %13, %add.11
  %14 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 14), align 8
  %add.13 = fadd fast float %14, %add.12
  %15 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 15), align 4
  %add.14 = fadd fast float %15, %add.13
  store float %add.14, ptr %res, align 16
  ret void
}

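; Integer variants of the same pattern over @arr_i32, for widths 4 through 32.
; A rough C sketch (illustrative only, not from the original source):
;
; void i32_red_example4(int *res) {
;   int sum = arr_i32[0];
;   for (int i = 1; i < 4; ++i)
;     sum += arr_i32[i];
;   *res = sum;
; }
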
define void @i32_red_example4(ptr %res) {
; CHECK-LABEL: @i32_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  store i32 %add.2, ptr %res, align 16
  ret void
}

define void @i32_red_example8(ptr %res) {
; CHECK-LABEL: @i32_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  store i32 %add.6, ptr %res, align 16
  ret void
}

define void @i32_red_example16(ptr %res) {
; CHECK-LABEL: @i32_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  store i32 %add.14, ptr %res, align 16
  ret void
}

define void @i32_red_example32(ptr %res) {
; CHECK-LABEL: @i32_red_example32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  %16 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 16), align 16
  %add.15 = add nsw i32 %16, %add.14
  %17 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 17), align 4
  %add.16 = add nsw i32 %17, %add.15
  %18 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 18), align 8
  %add.17 = add nsw i32 %18, %add.16
  %19 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 19), align 4
  %add.18 = add nsw i32 %19, %add.17
  %20 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 20), align 16
  %add.19 = add nsw i32 %20, %add.18
  %21 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 21), align 4
  %add.20 = add nsw i32 %21, %add.19
  %22 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 22), align 8
  %add.21 = add nsw i32 %22, %add.20
  %23 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 23), align 4
  %add.22 = add nsw i32 %23, %add.21
  %24 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 24), align 16
  %add.23 = add nsw i32 %24, %add.22
  %25 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 25), align 4
  %add.24 = add nsw i32 %25, %add.23
  %26 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 26), align 8
  %add.25 = add nsw i32 %26, %add.24
  %27 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 27), align 4
  %add.26 = add nsw i32 %27, %add.25
  %28 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 28), align 16
  %add.27 = add nsw i32 %28, %add.26
  %29 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 29), align 4
  %add.28 = add nsw i32 %29, %add.27
  %30 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 30), align 8
  %add.29 = add nsw i32 %30, %add.28
  %31 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 31), align 4
  %add.30 = add nsw i32 %31, %add.29
  store i32 %add.30, ptr %res, align 16
  ret void
}

declare i32 @foobar(i32)

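; Here the 8-wide reduction feeds a call. A rough C sketch (illustrative only):
;   foobar(arr_i32[0] + arr_i32[1] + ... + arr_i32[7]);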
define void @i32_red_call(i32 %val) {
; CHECK-LABEL: @i32_red_call(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = call i32 @foobar(i32 %add.6)
  ret void
}

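; Same 8-wide reduction as @i32_red_call, but the result feeds an invoke that
; may unwind; the reduction should still be vectorized.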
define void @i32_red_invoke(i32 %val) personality ptr @__gxx_personality_v0 {
; CHECK-LABEL: @i32_red_invoke(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT:            to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
; CHECK:       exception:
; CHECK-NEXT:    [[CLEANUP:%.*]] = landingpad i8
; CHECK-NEXT:            cleanup
; CHECK-NEXT:    br label [[NORMAL]]
; CHECK:       normal:
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
exception:
  %cleanup = landingpad i8 cleanup
  br label %normal
normal:
  ret void
}

; Test case from PR47670. Reduction result is used as incoming value in phi.
define i32 @reduction_result_used_in_phi(ptr nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[DATA:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[BB]] ]
; CHECK-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, ptr %data, align 4
  %idx.1 = getelementptr inbounds i32, ptr %data, i64 1
  %l.1 = load i32, ptr %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, ptr %data, i64 2
  %l.2 = load i32, ptr %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, ptr %data, i64 3
  %l.3 = load i32, ptr %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
  ret i32 %sum.1
}

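; As above: the reduction result is the incoming value of a phi.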
define i32 @reduction_result_used_in_phi_loop(ptr nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[DATA:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[BB]] ]
; CHECK-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, ptr %data, align 4
  %idx.1 = getelementptr inbounds i32, ptr %data, i64 1
  %l.1 = load i32, ptr %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, ptr %data, i64 2
  %l.2 = load i32, ptr %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, ptr %data, i64 3
  %l.3 = load i32, ptr %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
  ret i32 %sum.1
}

; Make sure we do not crash or infinite loop on ill-formed IR.

define void @unreachable_block() {
; CHECK-LABEL: @unreachable_block(
; CHECK-NEXT:  bb.0:
; CHECK-NEXT:    br label [[BB_1:%.*]]
; CHECK:       dead:
; CHECK-NEXT:    [[T0:%.*]] = add i16 [[T0]], undef
; CHECK-NEXT:    br label [[BB_1]]
; CHECK:       bb.1:
; CHECK-NEXT:    [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; CHECK-NEXT:    ret void
;
bb.0:
  br label %bb.1

dead:
  %t0 = add i16 %t0, undef ; unreachable IR may depend on itself
  br label %bb.1

bb.1:
  %t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
  ret void
}

; The FMF on the reduction should match the incoming insts.

define float @fadd_v4f32_fmf(ptr %p) {
; CHECK-LABEL: @fadd_v4f32_fmf(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP1]])
; CHECK-NEXT:    ret float [[TMP2]]
;
  %p1 = getelementptr inbounds float, ptr %p, i64 1
  %p2 = getelementptr inbounds float, ptr %p, i64 2
  %p3 = getelementptr inbounds float, ptr %p, i64 3
  %t0 = load float, ptr %p, align 4
  %t1 = load float, ptr %p1, align 4
  %t2 = load float, ptr %p2, align 4
  %t3 = load float, ptr %p3, align 4
  %add1 = fadd reassoc nsz float %t1, %t0
  %add2 = fadd reassoc nsz float %t2, %add1
  %add3 = fadd reassoc nsz float %t3, %add2
  ret float %add3
}

; The minimal FMF for fadd reduction are "reassoc nsz".
; Only the common FMF of all operations in the reduction propagate to the result.
; In this example, "contract nnan arcp" are dropped, but "ninf" transfers with the required flags.

define float @fadd_v4f32_fmf_intersect(ptr %p) {
; CHECK-LABEL: @fadd_v4f32_fmf_intersect(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP1]])
; CHECK-NEXT:    ret float [[TMP2]]
;
  %p1 = getelementptr inbounds float, ptr %p, i64 1
  %p2 = getelementptr inbounds float, ptr %p, i64 2
  %p3 = getelementptr inbounds float, ptr %p, i64 3
  %t0 = load float, ptr %p, align 4
  %t1 = load float, ptr %p1, align 4
  %t2 = load float, ptr %p2, align 4
  %t3 = load float, ptr %p3, align 4
  %add1 = fadd ninf reassoc nsz nnan float %t1, %t0
  %add2 = fadd ninf reassoc nsz nnan arcp float %t2, %add1
  %add3 = fadd ninf reassoc nsz contract float %t3, %add2
  ret float %add3
}

; This must not propagate 'nsw' to a new add instruction.

define void @nsw_propagation_v4i32(ptr %res, i32 %start) {
; CHECK-LABEL: @nsw_propagation_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
; CHECK-NEXT:    store i32 [[OP_RDX]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;

; STORE-LABEL: @nsw_propagation_v4i32(
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT:    [[OP_RDX:%.*]] = add i32 [[START:%.*]], [[TMP2]]
; STORE-NEXT:    store i32 [[OP_RDX]], ptr [[RES:%.*]], align 16
; STORE-NEXT:    ret void
  %t0 = load i32, ptr @arr_i32, align 16
  %t1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %t2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %t3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %s = add nsw i32 %start, %t0
  %add = add nsw i32 %t1, %s
  %add.1 = add nsw i32 %t2, %add
  %add.2 = add nsw i32 %t3, %add.1
  store i32 %add.2, ptr %res, align 16
  ret void
}

declare i32 @__gxx_personality_v0(...)