; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold to horizontal binops even in the presence of UNDEFs.
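;
; For example, a pairwise sum of two adjacent lanes inserted into an
; otherwise-undef vector is a candidate for a single horizontal add:
;
;   %lo  = extractelement <4 x float> %v, i32 0
;   %hi  = extractelement <4 x float> %v, i32 1
;   %sum = fadd float %lo, %hi
;   %res = insertelement <4 x float> undef, float %sum, i32 0
;
; (Illustrative sketch only; %v and the value names are hypothetical.)
; With fast-hops this is expected to select (v)haddps and leave the unused
; lanes undef, while slow-hop targets may prefer a shuffle plus addss, as
; the checks for @test4_undef below show.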

define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test4_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test4_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test4_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test4_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-SLOW-LABEL: test5_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test5_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test5_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test5_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test7_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test8_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT:    addss %xmm2, %xmm0
; SSE-SLOW-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test8_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,1]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test8_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test8_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-SLOW-LABEL: test11_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm3, %xmm1
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test11_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movaps %xmm3, %xmm1
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    haddps %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-LABEL: test11_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test12_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test13_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: test13_v16f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test13_v16f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test13_v16f32_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX512-SLOW-LABEL: test13_v16f32_undef:
; AVX512-SLOW:       # %bb.0:
; AVX512-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-SLOW-NEXT:    retq
  %vecext = extractelement <16 x float> %a, i32 0
  %vecext1 = extractelement <16 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <16 x float> %a, i32 2
  %vecext3 = extractelement <16 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <16 x float> %a, i32 4
  %vecext5 = extractelement <16 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <16 x float> %a, i32 6
  %vecext7 = extractelement <16 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
  ret <16 x float> %vecinit4
}
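
; The add_pd_* and add_ps_* tests below build the horizontal-add pattern
; from shufflevector lane pairs (fadd of shuffled halves) rather than
; scalar extract/insert chains.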
define <2 x double> @add_pd_003(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_003:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_003:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_003:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

; Change shuffle mask - no undefs.

define <2 x double> @add_pd_003_2(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003_2:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_003_2:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_003_2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_003_2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

define <2 x double> @add_pd_010(<2 x double> %x) {
; SSE-LABEL: add_pd_010:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_010:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_010:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %shuffle2
}

define <4 x float> @add_ps_007(<4 x float> %x) {
; SSE-LABEL: add_ps_007:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_ps_007:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_030(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_030:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_030:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_030:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_030:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_007_2(<4 x float> %x) {
; SSE-LABEL: add_ps_007_2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_ps_007_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_008(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_008:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_008:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_008:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_008:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  ret <4 x float> %add
}

define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: add_ps_016:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_ps_016:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3]
; AVX-NEXT:    retq
  %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
  %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 6, i32 1, i32 2, i32 undef>
  ret <4 x float> %9
}

define <4 x float> @add_ps_017(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_017:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_017:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_017:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_018(<4 x float> %x) {
; SSE-LABEL: add_ps_018:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: add_ps_018:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: add_ps_018:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: add_ps_018:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: add_pd_011:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm2, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm3
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE-SLOW-NEXT:    addpd %xmm3, %xmm0
; SSE-SLOW-NEXT:    addpd %xmm2, %xmm1
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_011:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: add_pd_011:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: add_pd_011:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: add_pd_011:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX512-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 undef, i32 5, i32 undef>
  %5 = fadd <4 x double> %3, %4
  %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  ret <4 x double> %6
}

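; In the tests below the inputs are wider than the result; the horizontal
; add should be formed from just the demanded 128-bit (or 256-bit)
; subvectors of the inputs.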
define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add0 = fadd float %a0, %a1
  %add2 = fadd float %b0, %b1
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
; SSE-LABEL: v8f32_input0_v4f32_output_0123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_input0_v4f32_output_0123:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_input1_v4f32_output_2301:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_input1_v4f32_output_2301:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add1 = fadd float %a2, %a3
  %add2 = fadd float %b0, %b1
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %add1 = fadd float %a2, %a3
  %add3 = fadd float %b2, %b3
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %a0 = extractelement <16 x float> %a, i32 0
  %a1 = extractelement <16 x float> %a, i32 1
  %b2 = extractelement <16 x float> %b, i32 2
  %b3 = extractelement <16 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm5, %xmm1
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <16 x float> %a, i32 4
  %a5 = extractelement <16 x float> %a, i32 5
  %b6 = extractelement <16 x float> %b, i32 6
  %b7 = extractelement <16 x float> %b, i32 7
  %add4 = fadd float %a4, %a5
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

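; PR40243: the pairwise sums are only demanded in the upper 128-bit half
; of the result; this should still form a single 256-bit horizontal add.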
define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR40243:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40243:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %add4 = fadd float %a4, %a5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

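; PR44694: fadd of even-lane/odd-lane shuffles drawn from two 256-bit
; sources, with lane 0 of the result undef.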
define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR44694:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT:    haddpd %xmm3, %xmm2
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm2, %xmm1
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR44694:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm2
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR44694:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR44694:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: PR44694:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %5 = fadd <4 x double> %3, %4
  ret <4 x double> %5
}

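; PR45747: only a single pairwise sum is demanded, read back through a
; shuffle of the result; fast-hops should still form a horizontal add,
; while slow-hop targets may use shuffles plus a vector add instead.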
define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_1:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR45747_1:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR45747_1:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR45747_1:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %a
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_2:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR45747_2:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR45747_2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR45747_2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %b
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

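; PR34724: partial horizontal adds where one result lane (named by the
; 'u' position in each test's suffix) is left undef.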
define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_u123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR34724_add_v4f32_u123:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE-SLOW-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-SLOW-NEXT:    addps %xmm2, %xmm0
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm2
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm1, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %3, %0
  %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %6 = fadd <4 x float> %5, %1
  %7 = shufflevector <4 x float> %4, <4 x float> %6, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %8 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %9 = fadd <4 x float> %8, %1
  %10 = shufflevector <4 x float> %7, <4 x float> %9, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
  ret <4 x float> %10
}

define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_01u3:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR34724_add_v4f32_01u3:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 undef, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_012u:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR34724_add_v4f32_012u:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  ret <4 x float> %9
}

define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_u123:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm2, %xmm1
; SSE-SLOW-NEXT:    movapd %xmm3, %xmm2
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT:    addsd %xmm3, %xmm2
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_u123:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm2
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_u123:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_u123:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 1
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_0u23:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm2, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm3, %xmm2
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT:    addsd %xmm3, %xmm2
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_0u23:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    haddpd %xmm2, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_01u3:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm3, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE-SLOW-NEXT:    addsd %xmm3, %xmm1
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_01u3:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm3
; SSE-FAST-NEXT:    movapd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-FAST-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; AVX512-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_012u:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm2, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT:    addsd %xmm2, %xmm1
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_012u:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm2, %xmm2
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_012u:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_012u:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 0
  %11 = extractelement <4 x double> %1, i32 1
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 2
  ret <4 x double> %13
}