; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}

@b = common global [4 x i32] zeroinitializer, align 16
@c = common global [4 x i32] zeroinitializer, align 16
@d = common global [4 x i32] zeroinitializer, align 16
@e = common global [4 x i32] zeroinitializer, align 16
@a = common global [4 x i32] zeroinitializer, align 16
@fb = common global [4 x float] zeroinitializer, align 16
@fc = common global [4 x float] zeroinitializer, align 16
@fa = common global [4 x float] zeroinitializer, align 16
@fd = common global [4 x float] zeroinitializer, align 16

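; Check vectorization of the following code for the i32 data type (a comment in
; the style of the later tests, restating what the scalar IR below computes):
;  a[0] = (b[0]+c[0]) + (d[0]+e[0]);
;  a[1] = (b[1]+c[1]) - (d[1]+e[1]);
;  a[2] = (b[2]+c[2]) + (d[2]+e[2]);
;  a[3] = (b[3]+c[3]) - (d[3]+e[3]);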
; Function Attrs: nounwind uwtable
define void @addsub() #0 {
; CHECK-LABEL: @addsub(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @c, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @d, align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr @e, align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x i32> [[TMP8]], ptr @a, align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @b, align 4
  %1 = load i32, ptr @c, align 4
  %add = add nsw i32 %0, %1
  %2 = load i32, ptr @d, align 4
  %3 = load i32, ptr @e, align 4
  %add1 = add nsw i32 %2, %3
  %add2 = add nsw i32 %add, %add1
  store i32 %add2, ptr @a, align 4
  %4 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 1), align 4
  %5 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 1), align 4
  %add3 = add nsw i32 %4, %5
  %6 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 1), align 4
  %7 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 1), align 4
  %add4 = add nsw i32 %6, %7
  %sub = sub nsw i32 %add3, %add4
  store i32 %sub, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 1), align 4
  %8 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 2), align 4
  %9 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 2), align 4
  %add5 = add nsw i32 %8, %9
  %10 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 2), align 4
  %11 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 2), align 4
  %add6 = add nsw i32 %10, %11
  %add7 = add nsw i32 %add5, %add6
  store i32 %add7, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 2), align 4
  %12 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 3), align 4
  %13 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 3), align 4
  %add8 = add nsw i32 %12, %13
  %14 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 3), align 4
  %15 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 3), align 4
  %add9 = add nsw i32 %14, %15
  %sub10 = sub nsw i32 %add8, %add9
  store i32 %sub10, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 3), align 4
  ret void
}

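; Same as @addsub, but every scalar result is passed through freeze before the
; store. Per the CHECK lines, the freezes should be folded into a single
; freeze of the shuffled <4 x i32> add/sub result.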
define void @addsub_freeze() #0 {
; CHECK-LABEL: @addsub_freeze(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @c, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @d, align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr @e, align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = freeze <4 x i32> [[TMP8]]
; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr @a, align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @b, align 4
  %1 = load i32, ptr @c, align 4
  %add = add nsw i32 %0, %1
  %2 = load i32, ptr @d, align 4
  %3 = load i32, ptr @e, align 4
  %add1 = add nsw i32 %2, %3
  %add2 = add nsw i32 %add, %add1
  %freeze.add2 = freeze i32 %add2
  store i32 %freeze.add2, ptr @a, align 4
  %4 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 1), align 4
  %5 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 1), align 4
  %add3 = add nsw i32 %4, %5
  %6 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 1), align 4
  %7 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 1), align 4
  %add4 = add nsw i32 %6, %7
  %sub = sub nsw i32 %add3, %add4
  %freeze.sub = freeze i32 %sub
  store i32 %freeze.sub, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 1), align 4
  %8 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 2), align 4
  %9 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 2), align 4
  %add5 = add nsw i32 %8, %9
  %10 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 2), align 4
  %11 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 2), align 4
  %add6 = add nsw i32 %10, %11
  %add7 = add nsw i32 %add5, %add6
  %freeze.add7 = freeze i32 %add7
  store i32 %freeze.add7, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 2), align 4
  %12 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 3), align 4
  %13 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 3), align 4
  %add8 = add nsw i32 %12, %13
  %14 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 3), align 4
  %15 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 3), align 4
  %add9 = add nsw i32 %14, %15
  %sub10 = sub nsw i32 %add8, %add9
  %freeze.sub10 = freeze i32 %sub10
  store i32 %freeze.sub10, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 3), align 4
  ret void
}

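; Check vectorization of the following code for the i32 data type (restating
; the scalar IR below; same as @addsub with the add/sub lanes swapped):
;  a[0] = (b[0]+c[0]) - (d[0]+e[0]);
;  a[1] = (b[1]+c[1]) + (d[1]+e[1]);
;  a[2] = (b[2]+c[2]) - (d[2]+e[2]);
;  a[3] = (b[3]+c[3]) + (d[3]+e[3]);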
; Function Attrs: nounwind uwtable
define void @subadd() #0 {
; CHECK-LABEL: @subadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @c, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @d, align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr @e, align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x i32> [[TMP8]], ptr @a, align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @b, align 4
  %1 = load i32, ptr @c, align 4
  %add = add nsw i32 %0, %1
  %2 = load i32, ptr @d, align 4
  %3 = load i32, ptr @e, align 4
  %add1 = add nsw i32 %2, %3
  %sub = sub nsw i32 %add, %add1
  store i32 %sub, ptr @a, align 4
  %4 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 1), align 4
  %5 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 1), align 4
  %add2 = add nsw i32 %4, %5
  %6 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 1), align 4
  %7 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 1), align 4
  %add3 = add nsw i32 %6, %7
  %add4 = add nsw i32 %add2, %add3
  store i32 %add4, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 1), align 4
  %8 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 2), align 4
  %9 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 2), align 4
  %add5 = add nsw i32 %8, %9
  %10 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 2), align 4
  %11 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 2), align 4
  %add6 = add nsw i32 %10, %11
  %sub7 = sub nsw i32 %add5, %add6
  store i32 %sub7, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 2), align 4
  %12 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 3), align 4
  %13 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 3), align 4
  %add8 = add nsw i32 %12, %13
  %14 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 3), align 4
  %15 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 3), align 4
  %add9 = add nsw i32 %14, %15
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 3), align 4
  ret void
}

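; Check vectorization of the following code for the float data type
; (alternating fadd/fsub, restating the scalar IR below):
;  fa[0] = fb[0]+fc[0];
;  fa[1] = fb[1]-fc[1];
;  fa[2] = fb[2]+fc[2];
;  fa[3] = fb[3]-fc[3];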
; Function Attrs: nounwind uwtable
define void @faddfsub() #0 {
; CHECK-LABEL: @faddfsub(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @fb, align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fc, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr @fa, align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @fb, align 4
  %1 = load float, ptr @fc, align 4
  %add = fadd float %0, %1
  store float %add, ptr @fa, align 4
  %2 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
  %3 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 1), align 4
  %sub = fsub float %2, %3
  store float %sub, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
  %5 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 2), align 4
  %add1 = fadd float %4, %5
  store float %add1, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
  %6 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 3), align 4
  %7 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 3), align 4
  %sub2 = fsub float %6, %7
  store float %sub2, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 3), align 4
  ret void
}

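; Check vectorization of the following code for the float data type
; (the @faddfsub pattern with fsub/fadd lanes swapped):
;  fa[0] = fb[0]-fc[0];
;  fa[1] = fb[1]+fc[1];
;  fa[2] = fb[2]-fc[2];
;  fa[3] = fb[3]+fc[3];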
; Function Attrs: nounwind uwtable
define void @fsubfadd() #0 {
; CHECK-LABEL: @fsubfadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @fb, align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fc, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr @fa, align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @fb, align 4
  %1 = load float, ptr @fc, align 4
  %sub = fsub float %0, %1
  store float %sub, ptr @fa, align 4
  %2 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
  %3 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 1), align 4
  %add = fadd float %2, %3
  store float %add, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
  %5 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 2), align 4
  %sub1 = fsub float %4, %5
  store float %sub1, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
  %6 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 3), align 4
  %7 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 3), align 4
  %add2 = fadd float %6, %7
  store float %add2, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 3), align 4
  ret void
}

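; Check vectorization of the following code for the float data type:
;  fa[0] = fb[0]+fc[0];
;  fa[1] = fb[1]+fc[1];
;  fa[2] = fb[2]+fc[2];
;  fa[3] = fb[3]-fc[3];
; Only the last lane subtracts, so per the CHECK lines the shuffle mask
; <0, 1, 2, 7> selects three lanes from the fadd vector and one from the fsub.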
; Function Attrs: nounwind uwtable
define void @faddfsub_select() #0 {
; CHECK-LABEL: @faddfsub_select(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @fb, align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fc, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr @fa, align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @fb, align 4
  %1 = load float, ptr @fc, align 4
  %add = fadd float %0, %1
  store float %add, ptr @fa, align 4
  %2 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
  %3 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 1), align 4
  %add1 = fadd float %2, %3
  store float %add1, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
  %4 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
  %5 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 2), align 4
  %add2 = fadd float %4, %5
  store float %add2, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
  %6 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 3), align 4
  %7 = load float, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 3), align 4
  %sub = fsub float %6, %7
  store float %sub, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 3), align 4
  ret void
}

; Check vectorization of the following code for the float data type:
;  fc[0] = fb[0]+fa[0]; // swapped fb and fa
;  fc[1] = fa[1]-fb[1];
;  fc[2] = fa[2]+fb[2];
;  fc[3] = fa[3]-fb[3];

define void @reorder_alt() #0 {
; CHECK-LABEL: @reorder_alt(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fa, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @fb, align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr @fc, align 4
; CHECK-NEXT:    ret void
;
  %1 = load float, ptr @fb, align 4
  %2 = load float, ptr @fa, align 4
  %3 = fadd float %1, %2
  store float %3, ptr @fc, align 4
  %4 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
  %5 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
  %6 = fsub float %4, %5
  store float %6, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 1), align 4
  %7 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
  %8 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
  %9 = fadd float %7, %8
  store float %9, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 2), align 4
  %10 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 3), align 4
  %11 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 3), align 4
  %12 = fsub float %10, %11
  store float %12, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 3), align 4
  ret void
}

; Check vectorization of the following code for the float data type:
;  fc[0] = fa[0]+(fb[0]-fd[0]);
;  fc[1] = fa[1]-(fb[1]+fd[1]);
;  fc[2] = fa[2]+(fb[2]-fd[2]);
;  fc[3] = fa[3]-(fd[3]+fb[3]); // swapped fd and fb

define void @reorder_alt_subTree() #0 {
; CHECK-LABEL: @reorder_alt_subTree(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fa, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @fd, align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr @fb, align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP1]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP1]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP9]], ptr @fc, align 4
; CHECK-NEXT:    ret void
;
  %1 = load float, ptr @fa, align 4
  %2 = load float, ptr @fb, align 4
  %3 = load float, ptr @fd, align 4
  %4 = fsub float %2, %3
  %5 = fadd float %1, %4
  store float %5, ptr @fc, align 4
  %6 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
  %7 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
  %8 = load float, ptr getelementptr inbounds ([4 x float], ptr @fd, i32 0, i64 1), align 4
  %9 = fadd float %7, %8
  %10 = fsub float %6, %9
  store float %10, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 1), align 4
  %11 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
  %12 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
  %13 = load float, ptr getelementptr inbounds ([4 x float], ptr @fd, i32 0, i64 2), align 4
  %14 = fsub float %12, %13
  %15 = fadd float %11, %14
  store float %15, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 2), align 4
  %16 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 3), align 4
  %17 = load float, ptr getelementptr inbounds ([4 x float], ptr @fd, i32 0, i64 3), align 4
  %18 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 3), align 4
  %19 = fadd float %17, %18
  %20 = fsub float %16, %19
  store float %20, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 3), align 4
  ret void
}

; Check vectorization of the following code for the double data type:
;  c[0] = (a[0]+b[0])-d[0];
;  c[1] = d[1]+(a[1]+b[1]); // swapped d[1] and (a[1]+b[1])

define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %d) {
; CHECK-LABEL: @reorder_alt_rightsubTree(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %1 = load double, ptr %a
  %2 = load double, ptr %b
  %3 = fadd double %1, %2
  %4 = load double, ptr %d
  %5 = fsub double %3, %4
  store double %5, ptr %c
  %6 = getelementptr inbounds double, ptr %d, i64 1
  %7 = load double, ptr %6
  %8 = getelementptr inbounds double, ptr %a, i64 1
  %9 = load double, ptr %8
  %10 = getelementptr inbounds double, ptr %b, i64 1
  %11 = load double, ptr %10
  %12 = fadd double %9, %11
  %13 = fadd double %7, %12
  %14 = getelementptr inbounds double, ptr %c, i64 1
  store double %13, ptr %14
  ret void
}

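; Check vectorization of the following code for the float data type:
;  fc[0] = fb[0]+fa[0]; // swapped fb and fa
;  fc[1] = fa[1]-fb[1];
;  fc[2] = fa[2]+fb[2];
;  fc[3] = fb[3]-fa[3]; // swapped fb and fa
; Per the CHECK lines, the mismatched operand order makes the operands be
; gathered as two <2 x float> halves (via llvm.vector.insert) before the
; alternating fadd/fsub.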
define void @vec_shuff_reorder() #0 {
; CHECK-LABEL: @vec_shuff_reorder(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0)
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2)
; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr @fc, align 4
; CHECK-NEXT:    ret void
;
  %1 = load float, ptr @fb, align 4
  %2 = load float, ptr @fa, align 4
  %3 = fadd float %1, %2
  store float %3, ptr @fc, align 4
  %4 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
  %5 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
  %6 = fsub float %4, %5
  store float %6, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 1), align 4
  %7 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
  %8 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
  %9 = fadd float %7, %8
  store float %9, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 2), align 4
  %10 = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 3), align 4
  %11 = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 3), align 4
  %12 = fsub float %10, %11
  store float %12, ptr getelementptr inbounds ([4 x float], ptr @fc, i32 0, i64 3), align 4
  ret void
}

attributes #0 = { nounwind }