1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST
6; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST
8; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW
9; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST
10
11define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
12; SSE3-LABEL: haddpd1:
13; SSE3:       # %bb.0:
14; SSE3-NEXT:    haddpd %xmm1, %xmm0
15; SSE3-NEXT:    retq
16;
17; AVX-LABEL: haddpd1:
18; AVX:       # %bb.0:
19; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
20; AVX-NEXT:    retq
21  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
22  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
23  %r = fadd <2 x double> %a, %b
24  ret <2 x double> %r
25}
26
27define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
28; SSE3-LABEL: haddpd2:
29; SSE3:       # %bb.0:
30; SSE3-NEXT:    haddpd %xmm1, %xmm0
31; SSE3-NEXT:    retq
32;
33; AVX-LABEL: haddpd2:
34; AVX:       # %bb.0:
35; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
36; AVX-NEXT:    retq
37  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
38  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
39  %r = fadd <2 x double> %a, %b
40  ret <2 x double> %r
41}
42
43define <2 x double> @haddpd3(<2 x double> %x) {
44; SSE3-SLOW-LABEL: haddpd3:
45; SSE3-SLOW:       # %bb.0:
46; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
47; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
48; SSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
49; SSE3-SLOW-NEXT:    retq
50;
51; SSE3-FAST-LABEL: haddpd3:
52; SSE3-FAST:       # %bb.0:
53; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
54; SSE3-FAST-NEXT:    retq
55;
56; AVX-SLOW-LABEL: haddpd3:
57; AVX-SLOW:       # %bb.0:
58; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
59; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
60; AVX-SLOW-NEXT:    retq
61;
62; AVX-FAST-LABEL: haddpd3:
63; AVX-FAST:       # %bb.0:
64; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
65; AVX-FAST-NEXT:    retq
66  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
67  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
68  %r = fadd <2 x double> %a, %b
69  ret <2 x double> %r
70}
71
72define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
73; SSE3-LABEL: haddps1:
74; SSE3:       # %bb.0:
75; SSE3-NEXT:    haddps %xmm1, %xmm0
76; SSE3-NEXT:    retq
77;
78; AVX-LABEL: haddps1:
79; AVX:       # %bb.0:
80; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
81; AVX-NEXT:    retq
82  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
83  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
84  %r = fadd <4 x float> %a, %b
85  ret <4 x float> %r
86}
87
88define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
89; SSE3-LABEL: haddps2:
90; SSE3:       # %bb.0:
91; SSE3-NEXT:    haddps %xmm1, %xmm0
92; SSE3-NEXT:    retq
93;
94; AVX-LABEL: haddps2:
95; AVX:       # %bb.0:
96; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
97; AVX-NEXT:    retq
98  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
99  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
100  %r = fadd <4 x float> %a, %b
101  ret <4 x float> %r
102}
103
104define <4 x float> @haddps3(<4 x float> %x) {
105; SSE3-LABEL: haddps3:
106; SSE3:       # %bb.0:
107; SSE3-NEXT:    haddps %xmm0, %xmm0
108; SSE3-NEXT:    retq
109;
110; AVX-LABEL: haddps3:
111; AVX:       # %bb.0:
112; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
113; AVX-NEXT:    retq
114  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
115  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
116  %r = fadd <4 x float> %a, %b
117  ret <4 x float> %r
118}
119
120define <4 x float> @haddps4(<4 x float> %x) {
121; SSE3-LABEL: haddps4:
122; SSE3:       # %bb.0:
123; SSE3-NEXT:    haddps %xmm0, %xmm0
124; SSE3-NEXT:    retq
125;
126; AVX-LABEL: haddps4:
127; AVX:       # %bb.0:
128; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
129; AVX-NEXT:    retq
130  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
131  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
132  %r = fadd <4 x float> %a, %b
133  ret <4 x float> %r
134}
135
136define <4 x float> @haddps5(<4 x float> %x) {
137; SSE3-LABEL: haddps5:
138; SSE3:       # %bb.0:
139; SSE3-NEXT:    haddps %xmm0, %xmm0
140; SSE3-NEXT:    retq
141;
142; AVX-LABEL: haddps5:
143; AVX:       # %bb.0:
144; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
145; AVX-NEXT:    retq
146  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
147  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
148  %r = fadd <4 x float> %a, %b
149  ret <4 x float> %r
150}
151
152define <4 x float> @haddps6(<4 x float> %x) {
153; SSE3-SLOW-LABEL: haddps6:
154; SSE3-SLOW:       # %bb.0:
155; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
156; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
157; SSE3-SLOW-NEXT:    retq
158;
159; SSE3-FAST-LABEL: haddps6:
160; SSE3-FAST:       # %bb.0:
161; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
162; SSE3-FAST-NEXT:    retq
163;
164; AVX-SLOW-LABEL: haddps6:
165; AVX-SLOW:       # %bb.0:
166; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
167; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
168; AVX-SLOW-NEXT:    retq
169;
170; AVX-FAST-LABEL: haddps6:
171; AVX-FAST:       # %bb.0:
172; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
173; AVX-FAST-NEXT:    retq
174  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
175  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
176  %r = fadd <4 x float> %a, %b
177  ret <4 x float> %r
178}
179
180define <4 x float> @haddps7(<4 x float> %x) {
181; SSE3-LABEL: haddps7:
182; SSE3:       # %bb.0:
183; SSE3-NEXT:    haddps %xmm0, %xmm0
184; SSE3-NEXT:    retq
185;
186; AVX-LABEL: haddps7:
187; AVX:       # %bb.0:
188; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
189; AVX-NEXT:    retq
190  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
191  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
192  %r = fadd <4 x float> %a, %b
193  ret <4 x float> %r
194}
195
196define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
197; SSE3-LABEL: hsubpd1:
198; SSE3:       # %bb.0:
199; SSE3-NEXT:    hsubpd %xmm1, %xmm0
200; SSE3-NEXT:    retq
201;
202; AVX-LABEL: hsubpd1:
203; AVX:       # %bb.0:
204; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
205; AVX-NEXT:    retq
206  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
207  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
208  %r = fsub <2 x double> %a, %b
209  ret <2 x double> %r
210}
211
212define <2 x double> @hsubpd2(<2 x double> %x) {
213; SSE3-SLOW-LABEL: hsubpd2:
214; SSE3-SLOW:       # %bb.0:
215; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
216; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
217; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
218; SSE3-SLOW-NEXT:    retq
219;
220; SSE3-FAST-LABEL: hsubpd2:
221; SSE3-FAST:       # %bb.0:
222; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
223; SSE3-FAST-NEXT:    retq
224;
225; AVX-SLOW-LABEL: hsubpd2:
226; AVX-SLOW:       # %bb.0:
227; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
228; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
229; AVX-SLOW-NEXT:    retq
230;
231; AVX-FAST-LABEL: hsubpd2:
232; AVX-FAST:       # %bb.0:
233; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
234; AVX-FAST-NEXT:    retq
235  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
236  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
237  %r = fsub <2 x double> %a, %b
238  ret <2 x double> %r
239}
240
241define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
242; SSE3-LABEL: hsubps1:
243; SSE3:       # %bb.0:
244; SSE3-NEXT:    hsubps %xmm1, %xmm0
245; SSE3-NEXT:    retq
246;
247; AVX-LABEL: hsubps1:
248; AVX:       # %bb.0:
249; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
250; AVX-NEXT:    retq
251  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
252  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
253  %r = fsub <4 x float> %a, %b
254  ret <4 x float> %r
255}
256
257define <4 x float> @hsubps2(<4 x float> %x) {
258; SSE3-LABEL: hsubps2:
259; SSE3:       # %bb.0:
260; SSE3-NEXT:    hsubps %xmm0, %xmm0
261; SSE3-NEXT:    retq
262;
263; AVX-LABEL: hsubps2:
264; AVX:       # %bb.0:
265; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
266; AVX-NEXT:    retq
267  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
268  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
269  %r = fsub <4 x float> %a, %b
270  ret <4 x float> %r
271}
272
273define <4 x float> @hsubps3(<4 x float> %x) {
274; SSE3-LABEL: hsubps3:
275; SSE3:       # %bb.0:
276; SSE3-NEXT:    hsubps %xmm0, %xmm0
277; SSE3-NEXT:    retq
278;
279; AVX-LABEL: hsubps3:
280; AVX:       # %bb.0:
281; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
282; AVX-NEXT:    retq
283  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
284  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
285  %r = fsub <4 x float> %a, %b
286  ret <4 x float> %r
287}
288
289define <4 x float> @hsubps4(<4 x float> %x) {
290; SSE3-SLOW-LABEL: hsubps4:
291; SSE3-SLOW:       # %bb.0:
292; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
293; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
294; SSE3-SLOW-NEXT:    retq
295;
296; SSE3-FAST-LABEL: hsubps4:
297; SSE3-FAST:       # %bb.0:
298; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
299; SSE3-FAST-NEXT:    retq
300;
301; AVX-SLOW-LABEL: hsubps4:
302; AVX-SLOW:       # %bb.0:
303; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
304; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
305; AVX-SLOW-NEXT:    retq
306;
307; AVX-FAST-LABEL: hsubps4:
308; AVX-FAST:       # %bb.0:
309; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
310; AVX-FAST-NEXT:    retq
311  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
312  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
313  %r = fsub <4 x float> %a, %b
314  ret <4 x float> %r
315}
316
317define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
318; SSE3-LABEL: vhaddps1:
319; SSE3:       # %bb.0:
320; SSE3-NEXT:    haddps %xmm2, %xmm0
321; SSE3-NEXT:    haddps %xmm3, %xmm1
322; SSE3-NEXT:    retq
323;
324; AVX-LABEL: vhaddps1:
325; AVX:       # %bb.0:
326; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
327; AVX-NEXT:    retq
328  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
329  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
330  %r = fadd <8 x float> %a, %b
331  ret <8 x float> %r
332}
333
334define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
335; SSE3-LABEL: vhaddps2:
336; SSE3:       # %bb.0:
337; SSE3-NEXT:    haddps %xmm2, %xmm0
338; SSE3-NEXT:    haddps %xmm3, %xmm1
339; SSE3-NEXT:    retq
340;
341; AVX-LABEL: vhaddps2:
342; AVX:       # %bb.0:
343; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
344; AVX-NEXT:    retq
345  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
346  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
347  %r = fadd <8 x float> %a, %b
348  ret <8 x float> %r
349}
350
351define <8 x float> @vhaddps3(<8 x float> %x) {
352; SSE3-LABEL: vhaddps3:
353; SSE3:       # %bb.0:
354; SSE3-NEXT:    haddps %xmm0, %xmm0
355; SSE3-NEXT:    haddps %xmm1, %xmm1
356; SSE3-NEXT:    retq
357;
358; AVX-LABEL: vhaddps3:
359; AVX:       # %bb.0:
360; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
361; AVX-NEXT:    retq
362  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
363  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
364  %r = fadd <8 x float> %a, %b
365  ret <8 x float> %r
366}
367
368define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
369; SSE3-LABEL: vhsubps1:
370; SSE3:       # %bb.0:
371; SSE3-NEXT:    hsubps %xmm2, %xmm0
372; SSE3-NEXT:    hsubps %xmm3, %xmm1
373; SSE3-NEXT:    retq
374;
375; AVX-LABEL: vhsubps1:
376; AVX:       # %bb.0:
377; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
378; AVX-NEXT:    retq
379  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
380  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
381  %r = fsub <8 x float> %a, %b
382  ret <8 x float> %r
383}
384
385define <8 x float> @vhsubps3(<8 x float> %x) {
386; SSE3-LABEL: vhsubps3:
387; SSE3:       # %bb.0:
388; SSE3-NEXT:    hsubps %xmm0, %xmm0
389; SSE3-NEXT:    hsubps %xmm1, %xmm1
390; SSE3-NEXT:    retq
391;
392; AVX-LABEL: vhsubps3:
393; AVX:       # %bb.0:
394; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
395; AVX-NEXT:    retq
396  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
397  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
398  %r = fsub <8 x float> %a, %b
399  ret <8 x float> %r
400}
401
402define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
403; SSE3-LABEL: vhaddpd1:
404; SSE3:       # %bb.0:
405; SSE3-NEXT:    haddpd %xmm2, %xmm0
406; SSE3-NEXT:    haddpd %xmm3, %xmm1
407; SSE3-NEXT:    retq
408;
409; AVX-LABEL: vhaddpd1:
410; AVX:       # %bb.0:
411; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
412; AVX-NEXT:    retq
413  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
414  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
415  %r = fadd <4 x double> %a, %b
416  ret <4 x double> %r
417}
418
419define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
420; SSE3-LABEL: vhsubpd1:
421; SSE3:       # %bb.0:
422; SSE3-NEXT:    hsubpd %xmm2, %xmm0
423; SSE3-NEXT:    hsubpd %xmm3, %xmm1
424; SSE3-NEXT:    retq
425;
426; AVX-LABEL: vhsubpd1:
427; AVX:       # %bb.0:
428; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
429; AVX-NEXT:    retq
430  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
431  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
432  %r = fsub <4 x double> %a, %b
433  ret <4 x double> %r
434}
435
436define <2 x float> @haddps_v2f32(<4 x float> %v0) {
437; SSE3-LABEL: haddps_v2f32:
438; SSE3:       # %bb.0:
439; SSE3-NEXT:    haddps %xmm0, %xmm0
440; SSE3-NEXT:    retq
441;
442; AVX-LABEL: haddps_v2f32:
443; AVX:       # %bb.0:
444; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
445; AVX-NEXT:    retq
446  %v0.0 = extractelement <4 x float> %v0, i32 0
447  %v0.1 = extractelement <4 x float> %v0, i32 1
448  %v0.2 = extractelement <4 x float> %v0, i32 2
449  %v0.3 = extractelement <4 x float> %v0, i32 3
450  %op0 = fadd float %v0.0, %v0.1
451  %op1 = fadd float %v0.2, %v0.3
452  %res0 = insertelement <2 x float> undef, float %op0, i32 0
453  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
454  ret <2 x float> %res1
455}
456
457; 128-bit vectors, float/double, fadd/fsub
458
459define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
460; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
461; SSE3-SLOW:       # %bb.0:
462; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
463; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
464; SSE3-SLOW-NEXT:    retq
465;
466; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
467; SSE3-FAST:       # %bb.0:
468; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
469; SSE3-FAST-NEXT:    retq
470;
471; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
472; AVX-SLOW:       # %bb.0:
473; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
474; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
475; AVX-SLOW-NEXT:    retq
476;
477; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
478; AVX-FAST:       # %bb.0:
479; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
480; AVX-FAST-NEXT:    retq
481  %x0 = extractelement <4 x float> %x, i32 0
482  %x1 = extractelement <4 x float> %x, i32 1
483  %x01 = fadd float %x0, %x1
484  ret float %x01
485}
486
487define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
488; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
489; SSE3-SLOW:       # %bb.0:
490; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
491; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
492; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
493; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
494; SSE3-SLOW-NEXT:    retq
495;
496; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
497; SSE3-FAST:       # %bb.0:
498; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
499; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
500; SSE3-FAST-NEXT:    retq
501;
502; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
503; AVX-SLOW:       # %bb.0:
504; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
505; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
506; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
507; AVX-SLOW-NEXT:    retq
508;
509; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
510; AVX-FAST:       # %bb.0:
511; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
512; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
513; AVX-FAST-NEXT:    retq
514  %x0 = extractelement <4 x float> %x, i32 2
515  %x1 = extractelement <4 x float> %x, i32 3
516  %x01 = fadd float %x0, %x1
517  ret float %x01
518}
519
520define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
521; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
522; SSE3-SLOW:       # %bb.0:
523; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
524; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
525; SSE3-SLOW-NEXT:    retq
526;
527; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
528; SSE3-FAST:       # %bb.0:
529; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
530; SSE3-FAST-NEXT:    retq
531;
532; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
533; AVX-SLOW:       # %bb.0:
534; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
535; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
536; AVX-SLOW-NEXT:    retq
537;
538; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
539; AVX-FAST:       # %bb.0:
540; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
541; AVX-FAST-NEXT:    retq
542  %x0 = extractelement <4 x float> %x, i32 0
543  %x1 = extractelement <4 x float> %x, i32 1
544  %x01 = fadd float %x1, %x0
545  ret float %x01
546}
547
548define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
549; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
550; SSE3-SLOW:       # %bb.0:
551; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
552; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
553; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
554; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
555; SSE3-SLOW-NEXT:    retq
556;
557; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
558; SSE3-FAST:       # %bb.0:
559; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
560; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
561; SSE3-FAST-NEXT:    retq
562;
563; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
564; AVX-SLOW:       # %bb.0:
565; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
566; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
567; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
568; AVX-SLOW-NEXT:    retq
569;
570; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
571; AVX-FAST:       # %bb.0:
572; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
573; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
574; AVX-FAST-NEXT:    retq
575  %x0 = extractelement <4 x float> %x, i32 2
576  %x1 = extractelement <4 x float> %x, i32 3
577  %x01 = fadd float %x1, %x0
578  ret float %x01
579}
580
581define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
582; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
583; SSE3-SLOW:       # %bb.0:
584; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
585; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
586; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
587; SSE3-SLOW-NEXT:    retq
588;
589; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
590; SSE3-FAST:       # %bb.0:
591; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
592; SSE3-FAST-NEXT:    retq
593;
594; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
595; AVX-SLOW:       # %bb.0:
596; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
597; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
598; AVX-SLOW-NEXT:    retq
599;
600; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
601; AVX-FAST:       # %bb.0:
602; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
603; AVX-FAST-NEXT:    retq
604  %x0 = extractelement <2 x double> %x, i32 0
605  %x1 = extractelement <2 x double> %x, i32 1
606  %x01 = fadd double %x0, %x1
607  ret double %x01
608}
609
610define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
611; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
612; SSE3-SLOW:       # %bb.0:
613; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
614; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
615; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
616; SSE3-SLOW-NEXT:    retq
617;
618; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
619; SSE3-FAST:       # %bb.0:
620; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
621; SSE3-FAST-NEXT:    retq
622;
623; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
624; AVX-SLOW:       # %bb.0:
625; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
626; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
627; AVX-SLOW-NEXT:    retq
628;
629; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
630; AVX-FAST:       # %bb.0:
631; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
632; AVX-FAST-NEXT:    retq
633  %x0 = extractelement <2 x double> %x, i32 0
634  %x1 = extractelement <2 x double> %x, i32 1
635  %x01 = fadd double %x1, %x0
636  ret double %x01
637}
638
639define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
640; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
641; SSE3-SLOW:       # %bb.0:
642; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
643; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
644; SSE3-SLOW-NEXT:    retq
645;
646; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
647; SSE3-FAST:       # %bb.0:
648; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
649; SSE3-FAST-NEXT:    retq
650;
651; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
652; AVX-SLOW:       # %bb.0:
653; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
654; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
655; AVX-SLOW-NEXT:    retq
656;
657; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
658; AVX-FAST:       # %bb.0:
659; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
660; AVX-FAST-NEXT:    retq
661  %x0 = extractelement <4 x float> %x, i32 0
662  %x1 = extractelement <4 x float> %x, i32 1
663  %x01 = fsub float %x0, %x1
664  ret float %x01
665}
666
667define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
668; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
669; SSE3-SLOW:       # %bb.0:
670; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
671; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
672; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
673; SSE3-SLOW-NEXT:    subss %xmm0, %xmm1
674; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
675; SSE3-SLOW-NEXT:    retq
676;
677; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
678; SSE3-FAST:       # %bb.0:
679; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
680; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
681; SSE3-FAST-NEXT:    retq
682;
683; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
684; AVX-SLOW:       # %bb.0:
685; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
686; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
687; AVX-SLOW-NEXT:    vsubss %xmm0, %xmm1, %xmm0
688; AVX-SLOW-NEXT:    retq
689;
690; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
691; AVX-FAST:       # %bb.0:
692; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
693; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
694; AVX-FAST-NEXT:    retq
695  %x0 = extractelement <4 x float> %x, i32 2
696  %x1 = extractelement <4 x float> %x, i32 3
697  %x01 = fsub float %x0, %x1
698  ret float %x01
699}
700
701define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
702; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
703; SSE3:       # %bb.0:
704; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
705; SSE3-NEXT:    subss %xmm0, %xmm1
706; SSE3-NEXT:    movaps %xmm1, %xmm0
707; SSE3-NEXT:    retq
708;
709; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
710; AVX:       # %bb.0:
711; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
712; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
713; AVX-NEXT:    retq
714  %x0 = extractelement <4 x float> %x, i32 0
715  %x1 = extractelement <4 x float> %x, i32 1
716  %x01 = fsub float %x1, %x0
717  ret float %x01
718}
719
720define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
721; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
722; SSE3:       # %bb.0:
723; SSE3-NEXT:    movaps %xmm0, %xmm1
724; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
725; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
726; SSE3-NEXT:    subss %xmm1, %xmm0
727; SSE3-NEXT:    retq
728;
729; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
730; AVX:       # %bb.0:
731; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
732; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
733; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
734; AVX-NEXT:    retq
735  %x0 = extractelement <4 x float> %x, i32 2
736  %x1 = extractelement <4 x float> %x, i32 3
737  %x01 = fsub float %x1, %x0
738  ret float %x01
739}
740
741define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
742; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
743; SSE3-SLOW:       # %bb.0:
744; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
745; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
746; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
747; SSE3-SLOW-NEXT:    retq
748;
749; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
750; SSE3-FAST:       # %bb.0:
751; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
752; SSE3-FAST-NEXT:    retq
753;
754; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
755; AVX-SLOW:       # %bb.0:
756; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
757; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
758; AVX-SLOW-NEXT:    retq
759;
760; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
761; AVX-FAST:       # %bb.0:
762; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
763; AVX-FAST-NEXT:    retq
764  %x0 = extractelement <2 x double> %x, i32 0
765  %x1 = extractelement <2 x double> %x, i32 1
766  %x01 = fsub double %x0, %x1
767  ret double %x01
768}
769
770define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
771; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
772; SSE3:       # %bb.0:
773; SSE3-NEXT:    movapd %xmm0, %xmm1
774; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
775; SSE3-NEXT:    subsd %xmm0, %xmm1
776; SSE3-NEXT:    movapd %xmm1, %xmm0
777; SSE3-NEXT:    retq
778;
779; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
780; AVX:       # %bb.0:
781; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
782; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
783; AVX-NEXT:    retq
784  %x0 = extractelement <2 x double> %x, i32 0
785  %x1 = extractelement <2 x double> %x, i32 1
786  %x01 = fsub double %x1, %x0
787  ret double %x01
788}
789
790; 256-bit vectors, float/double, fadd/fsub
791
792define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
793; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
794; SSE3-SLOW:       # %bb.0:
795; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
796; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
797; SSE3-SLOW-NEXT:    retq
798;
799; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
800; SSE3-FAST:       # %bb.0:
801; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
802; SSE3-FAST-NEXT:    retq
803;
804; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
805; AVX-SLOW:       # %bb.0:
806; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
807; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
808; AVX-SLOW-NEXT:    vzeroupper
809; AVX-SLOW-NEXT:    retq
810;
811; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
812; AVX-FAST:       # %bb.0:
813; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
814; AVX-FAST-NEXT:    vzeroupper
815; AVX-FAST-NEXT:    retq
816  %x0 = extractelement <8 x float> %x, i32 0
817  %x1 = extractelement <8 x float> %x, i32 1
818  %x01 = fadd float %x0, %x1
819  ret float %x01
820}
821
822define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
823; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
824; SSE3-SLOW:       # %bb.0:
825; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
826; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
827; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
828; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
829; SSE3-SLOW-NEXT:    retq
830;
831; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
832; SSE3-FAST:       # %bb.0:
833; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
834; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
835; SSE3-FAST-NEXT:    retq
836;
837; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
838; AVX-SLOW:       # %bb.0:
839; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
840; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
841; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
842; AVX-SLOW-NEXT:    vzeroupper
843; AVX-SLOW-NEXT:    retq
844;
845; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
846; AVX-FAST:       # %bb.0:
847; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
848; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
849; AVX-FAST-NEXT:    vzeroupper
850; AVX-FAST-NEXT:    retq
851  %x0 = extractelement <8 x float> %x, i32 2
852  %x1 = extractelement <8 x float> %x, i32 3
853  %x01 = fadd float %x0, %x1
854  ret float %x01
855}
856
857define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
858; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
859; SSE3-SLOW:       # %bb.0:
860; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
861; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
862; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
863; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
864; SSE3-SLOW-NEXT:    retq
865;
866; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
867; SSE3-FAST:       # %bb.0:
868; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
869; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
870; SSE3-FAST-NEXT:    retq
871;
872; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
873; AVX-SLOW:       # %bb.0:
874; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
875; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
876; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
877; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
878; AVX-SLOW-NEXT:    vzeroupper
879; AVX-SLOW-NEXT:    retq
880;
881; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
882; AVX-FAST:       # %bb.0:
883; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
884; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
885; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
886; AVX-FAST-NEXT:    vzeroupper
887; AVX-FAST-NEXT:    retq
888  %x0 = extractelement <8 x float> %x, i32 6
889  %x1 = extractelement <8 x float> %x, i32 7
890  %x01 = fadd float %x0, %x1
891  ret float %x01
892}
893
894define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
895; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
896; SSE3-SLOW:       # %bb.0:
897; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
898; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
899; SSE3-SLOW-NEXT:    retq
900;
901; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
902; SSE3-FAST:       # %bb.0:
903; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
904; SSE3-FAST-NEXT:    retq
905;
906; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
907; AVX-SLOW:       # %bb.0:
908; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
909; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
910; AVX-SLOW-NEXT:    vzeroupper
911; AVX-SLOW-NEXT:    retq
912;
913; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
914; AVX-FAST:       # %bb.0:
915; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
916; AVX-FAST-NEXT:    vzeroupper
917; AVX-FAST-NEXT:    retq
918  %x0 = extractelement <8 x float> %x, i32 0
919  %x1 = extractelement <8 x float> %x, i32 1
920  %x01 = fadd float %x1, %x0
921  ret float %x01
922}
923
924define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
925; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
926; SSE3-SLOW:       # %bb.0:
927; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
928; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
929; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
930; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
931; SSE3-SLOW-NEXT:    retq
932;
933; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
934; SSE3-FAST:       # %bb.0:
935; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
936; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
937; SSE3-FAST-NEXT:    retq
938;
939; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
940; AVX-SLOW:       # %bb.0:
941; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
942; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
943; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
944; AVX-SLOW-NEXT:    vzeroupper
945; AVX-SLOW-NEXT:    retq
946;
947; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
948; AVX-FAST:       # %bb.0:
949; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
950; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
951; AVX-FAST-NEXT:    vzeroupper
952; AVX-FAST-NEXT:    retq
953  %x0 = extractelement <8 x float> %x, i32 2
954  %x1 = extractelement <8 x float> %x, i32 3
955  %x01 = fadd float %x1, %x0
956  ret float %x01
957}
958
959define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
960; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
961; SSE3-SLOW:       # %bb.0:
962; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
963; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
964; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
965; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
966; SSE3-SLOW-NEXT:    retq
967;
968; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
969; SSE3-FAST:       # %bb.0:
970; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
971; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
972; SSE3-FAST-NEXT:    retq
973;
974; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
975; AVX-SLOW:       # %bb.0:
976; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
977; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
978; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
979; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
980; AVX-SLOW-NEXT:    vzeroupper
981; AVX-SLOW-NEXT:    retq
982;
983; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
984; AVX-FAST:       # %bb.0:
985; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
986; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
987; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
988; AVX-FAST-NEXT:    vzeroupper
989; AVX-FAST-NEXT:    retq
990  %x0 = extractelement <8 x float> %x, i32 6
991  %x1 = extractelement <8 x float> %x, i32 7
992  %x01 = fadd float %x1, %x0
993  ret float %x01
994}
995
996define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
997; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
998; SSE3-SLOW:       # %bb.0:
999; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1000; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1001; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1002; SSE3-SLOW-NEXT:    retq
1003;
1004; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1005; SSE3-FAST:       # %bb.0:
1006; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1007; SSE3-FAST-NEXT:    retq
1008;
1009; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
1010; AVX-SLOW:       # %bb.0:
1011; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1012; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1013; AVX-SLOW-NEXT:    vzeroupper
1014; AVX-SLOW-NEXT:    retq
1015;
1016; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1017; AVX-FAST:       # %bb.0:
1018; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1019; AVX-FAST-NEXT:    vzeroupper
1020; AVX-FAST-NEXT:    retq
1021  %x0 = extractelement <4 x double> %x, i32 0
1022  %x1 = extractelement <4 x double> %x, i32 1
1023  %x01 = fadd double %x0, %x1
1024  ret double %x01
1025}
1026
1027define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
1028; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1029; SSE3-SLOW:       # %bb.0:
1030; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1031; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1032; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1033; SSE3-SLOW-NEXT:    retq
1034;
1035; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1036; SSE3-FAST:       # %bb.0:
1037; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
1038; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
1039; SSE3-FAST-NEXT:    retq
1040;
1041; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1042; AVX-SLOW:       # %bb.0:
1043; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1044; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1045; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1046; AVX-SLOW-NEXT:    vzeroupper
1047; AVX-SLOW-NEXT:    retq
1048;
1049; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1050; AVX-FAST:       # %bb.0:
1051; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1052; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1053; AVX-FAST-NEXT:    vzeroupper
1054; AVX-FAST-NEXT:    retq
1055  %x0 = extractelement <4 x double> %x, i32 2
1056  %x1 = extractelement <4 x double> %x, i32 3
1057  %x01 = fadd double %x0, %x1
1058  ret double %x01
1059}
1060
1061define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
1062; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1063; SSE3-SLOW:       # %bb.0:
1064; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1065; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1066; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1067; SSE3-SLOW-NEXT:    retq
1068;
1069; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1070; SSE3-FAST:       # %bb.0:
1071; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1072; SSE3-FAST-NEXT:    retq
1073;
1074; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1075; AVX-SLOW:       # %bb.0:
1076; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1077; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1078; AVX-SLOW-NEXT:    vzeroupper
1079; AVX-SLOW-NEXT:    retq
1080;
1081; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1082; AVX-FAST:       # %bb.0:
1083; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1084; AVX-FAST-NEXT:    vzeroupper
1085; AVX-FAST-NEXT:    retq
1086  %x0 = extractelement <4 x double> %x, i32 0
1087  %x1 = extractelement <4 x double> %x, i32 1
1088  %x01 = fadd double %x1, %x0
1089  ret double %x01
1090}
1091
1092define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
1093; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1094; SSE3-SLOW:       # %bb.0:
1095; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1096; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1097; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1098; SSE3-SLOW-NEXT:    retq
1099;
1100; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1101; SSE3-FAST:       # %bb.0:
1102; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
1103; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
1104; SSE3-FAST-NEXT:    retq
1105;
1106; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1107; AVX-SLOW:       # %bb.0:
1108; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1109; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1110; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1111; AVX-SLOW-NEXT:    vzeroupper
1112; AVX-SLOW-NEXT:    retq
1113;
1114; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1115; AVX-FAST:       # %bb.0:
1116; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1117; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1118; AVX-FAST-NEXT:    vzeroupper
1119; AVX-FAST-NEXT:    retq
1120  %x0 = extractelement <4 x double> %x, i32 2
1121  %x1 = extractelement <4 x double> %x, i32 3
1122  %x01 = fadd double %x1, %x0
1123  ret double %x01
1124}
1125
1126define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
1127; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1128; SSE3-SLOW:       # %bb.0:
1129; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1130; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1131; SSE3-SLOW-NEXT:    retq
1132;
1133; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1134; SSE3-FAST:       # %bb.0:
1135; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1136; SSE3-FAST-NEXT:    retq
1137;
1138; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1139; AVX-SLOW:       # %bb.0:
1140; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1141; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1142; AVX-SLOW-NEXT:    vzeroupper
1143; AVX-SLOW-NEXT:    retq
1144;
1145; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1146; AVX-FAST:       # %bb.0:
1147; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1148; AVX-FAST-NEXT:    vzeroupper
1149; AVX-FAST-NEXT:    retq
1150  %x0 = extractelement <8 x float> %x, i32 0
1151  %x1 = extractelement <8 x float> %x, i32 1
1152  %x01 = fsub float %x0, %x1
1153  ret float %x01
1154}
1155
1156define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
1157; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1158; SSE3-SLOW:       # %bb.0:
1159; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
1160; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1161; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1162; SSE3-SLOW-NEXT:    subss %xmm0, %xmm1
1163; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
1164; SSE3-SLOW-NEXT:    retq
1165;
1166; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1167; SSE3-FAST:       # %bb.0:
1168; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1169; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1170; SSE3-FAST-NEXT:    retq
1171;
1172; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1173; AVX-SLOW:       # %bb.0:
1174; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1175; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1176; AVX-SLOW-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1177; AVX-SLOW-NEXT:    vzeroupper
1178; AVX-SLOW-NEXT:    retq
1179;
1180; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1181; AVX-FAST:       # %bb.0:
1182; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1183; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1184; AVX-FAST-NEXT:    vzeroupper
1185; AVX-FAST-NEXT:    retq
1186  %x0 = extractelement <8 x float> %x, i32 2
1187  %x1 = extractelement <8 x float> %x, i32 3
1188  %x01 = fsub float %x0, %x1
1189  ret float %x01
1190}
1191
1192define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
1193; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1194; SSE3-SLOW:       # %bb.0:
1195; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
1196; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
1197; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1198; SSE3-SLOW-NEXT:    retq
1199;
1200; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1201; SSE3-FAST:       # %bb.0:
1202; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
1203; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm0
1204; SSE3-FAST-NEXT:    retq
1205;
1206; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1207; AVX-SLOW:       # %bb.0:
1208; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1209; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1210; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1211; AVX-SLOW-NEXT:    vzeroupper
1212; AVX-SLOW-NEXT:    retq
1213;
1214; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1215; AVX-FAST:       # %bb.0:
1216; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1217; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1218; AVX-FAST-NEXT:    vzeroupper
1219; AVX-FAST-NEXT:    retq
1220  %x0 = extractelement <8 x float> %x, i32 4
1221  %x1 = extractelement <8 x float> %x, i32 5
1222  %x01 = fsub float %x0, %x1
1223  ret float %x01
1224}
1225
1226; Negative test...or get hoppy and negate?
1227
1228define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
1229; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
1230; SSE3:       # %bb.0:
1231; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1232; SSE3-NEXT:    subss %xmm0, %xmm1
1233; SSE3-NEXT:    movaps %xmm1, %xmm0
1234; SSE3-NEXT:    retq
1235;
1236; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
1237; AVX:       # %bb.0:
1238; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1239; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1240; AVX-NEXT:    vzeroupper
1241; AVX-NEXT:    retq
1242  %x0 = extractelement <8 x float> %x, i32 0
1243  %x1 = extractelement <8 x float> %x, i32 1
1244  %x01 = fsub float %x1, %x0
1245  ret float %x01
1246}
1247
1248define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
1249; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1250; SSE3-SLOW:       # %bb.0:
1251; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1252; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1253; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
1254; SSE3-SLOW-NEXT:    retq
1255;
1256; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1257; SSE3-FAST:       # %bb.0:
1258; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
1259; SSE3-FAST-NEXT:    retq
1260;
1261; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1262; AVX-SLOW:       # %bb.0:
1263; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1264; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
1265; AVX-SLOW-NEXT:    vzeroupper
1266; AVX-SLOW-NEXT:    retq
1267;
1268; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1269; AVX-FAST:       # %bb.0:
1270; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
1271; AVX-FAST-NEXT:    vzeroupper
1272; AVX-FAST-NEXT:    retq
1273  %x0 = extractelement <4 x double> %x, i32 0
1274  %x1 = extractelement <4 x double> %x, i32 1
1275  %x01 = fsub double %x0, %x1
1276  ret double %x01
1277}
1278
1279; Negative test...or get hoppy and negate?
1280
1281define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
1282; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
1283; SSE3:       # %bb.0:
1284; SSE3-NEXT:    movapd %xmm0, %xmm1
1285; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1286; SSE3-NEXT:    subsd %xmm0, %xmm1
1287; SSE3-NEXT:    movapd %xmm1, %xmm0
1288; SSE3-NEXT:    retq
1289;
1290; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
1291; AVX:       # %bb.0:
1292; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1293; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1294; AVX-NEXT:    vzeroupper
1295; AVX-NEXT:    retq
1296  %x0 = extractelement <4 x double> %x, i32 0
1297  %x1 = extractelement <4 x double> %x, i32 1
1298  %x01 = fsub double %x1, %x0
1299  ret double %x01
1300}
1301
1302; 512-bit vectors, float/double, fadd/fsub
1303
1304define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
1305; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1306; SSE3-SLOW:       # %bb.0:
1307; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1308; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1309; SSE3-SLOW-NEXT:    retq
1310;
1311; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1312; SSE3-FAST:       # %bb.0:
1313; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1314; SSE3-FAST-NEXT:    retq
1315;
1316; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1317; AVX-SLOW:       # %bb.0:
1318; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1319; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1320; AVX-SLOW-NEXT:    vzeroupper
1321; AVX-SLOW-NEXT:    retq
1322;
1323; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1324; AVX-FAST:       # %bb.0:
1325; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1326; AVX-FAST-NEXT:    vzeroupper
1327; AVX-FAST-NEXT:    retq
1328  %x0 = extractelement <16 x float> %x, i32 0
1329  %x1 = extractelement <16 x float> %x, i32 1
1330  %x01 = fadd float %x0, %x1
1331  ret float %x01
1332}
1333
1334define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
1335; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1336; SSE3-SLOW:       # %bb.0:
1337; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1338; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1339; SSE3-SLOW-NEXT:    retq
1340;
1341; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1342; SSE3-FAST:       # %bb.0:
1343; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1344; SSE3-FAST-NEXT:    retq
1345;
1346; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1347; AVX-SLOW:       # %bb.0:
1348; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1349; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
1350; AVX-SLOW-NEXT:    vzeroupper
1351; AVX-SLOW-NEXT:    retq
1352;
1353; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1354; AVX-FAST:       # %bb.0:
1355; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1356; AVX-FAST-NEXT:    vzeroupper
1357; AVX-FAST-NEXT:    retq
1358  %x0 = extractelement <16 x float> %x, i32 0
1359  %x1 = extractelement <16 x float> %x, i32 1
1360  %x01 = fadd float %x1, %x0
1361  ret float %x01
1362}
1363
1364define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
1365; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1366; SSE3-SLOW:       # %bb.0:
1367; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1368; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1369; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1370; SSE3-SLOW-NEXT:    retq
1371;
1372; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1373; SSE3-FAST:       # %bb.0:
1374; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1375; SSE3-FAST-NEXT:    retq
1376;
1377; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1378; AVX-SLOW:       # %bb.0:
1379; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1380; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1381; AVX-SLOW-NEXT:    vzeroupper
1382; AVX-SLOW-NEXT:    retq
1383;
1384; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1385; AVX-FAST:       # %bb.0:
1386; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1387; AVX-FAST-NEXT:    vzeroupper
1388; AVX-FAST-NEXT:    retq
1389  %x0 = extractelement <8 x double> %x, i32 0
1390  %x1 = extractelement <8 x double> %x, i32 1
1391  %x01 = fadd double %x0, %x1
1392  ret double %x01
1393}
1394
1395define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
1396; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1397; SSE3-SLOW:       # %bb.0:
1398; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1399; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1400; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1401; SSE3-SLOW-NEXT:    retq
1402;
1403; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1404; SSE3-FAST:       # %bb.0:
1405; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1406; SSE3-FAST-NEXT:    retq
1407;
1408; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1409; AVX-SLOW:       # %bb.0:
1410; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1411; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1412; AVX-SLOW-NEXT:    vzeroupper
1413; AVX-SLOW-NEXT:    retq
1414;
1415; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1416; AVX-FAST:       # %bb.0:
1417; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1418; AVX-FAST-NEXT:    vzeroupper
1419; AVX-FAST-NEXT:    retq
1420  %x0 = extractelement <8 x double> %x, i32 0
1421  %x1 = extractelement <8 x double> %x, i32 1
1422  %x01 = fadd double %x1, %x0
1423  ret double %x01
1424}
1425
1426define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
1427; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1428; SSE3-SLOW:       # %bb.0:
1429; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1430; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1431; SSE3-SLOW-NEXT:    retq
1432;
1433; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1434; SSE3-FAST:       # %bb.0:
1435; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1436; SSE3-FAST-NEXT:    retq
1437;
1438; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1439; AVX-SLOW:       # %bb.0:
1440; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1441; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1442; AVX-SLOW-NEXT:    vzeroupper
1443; AVX-SLOW-NEXT:    retq
1444;
1445; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1446; AVX-FAST:       # %bb.0:
1447; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1448; AVX-FAST-NEXT:    vzeroupper
1449; AVX-FAST-NEXT:    retq
1450  %x0 = extractelement <16 x float> %x, i32 0
1451  %x1 = extractelement <16 x float> %x, i32 1
1452  %x01 = fsub float %x0, %x1
1453  ret float %x01
1454}
1455
1456define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
1457; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
1458; SSE3:       # %bb.0:
1459; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1460; SSE3-NEXT:    subss %xmm0, %xmm1
1461; SSE3-NEXT:    movaps %xmm1, %xmm0
1462; SSE3-NEXT:    retq
1463;
1464; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
1465; AVX:       # %bb.0:
1466; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1467; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1468; AVX-NEXT:    vzeroupper
1469; AVX-NEXT:    retq
1470  %x0 = extractelement <16 x float> %x, i32 0
1471  %x1 = extractelement <16 x float> %x, i32 1
1472  %x01 = fsub float %x1, %x0
1473  ret float %x01
1474}
1475
1476define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
1477; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1478; SSE3-SLOW:       # %bb.0:
1479; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1480; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1481; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
1482; SSE3-SLOW-NEXT:    retq
1483;
1484; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1485; SSE3-FAST:       # %bb.0:
1486; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
1487; SSE3-FAST-NEXT:    retq
1488;
1489; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1490; AVX-SLOW:       # %bb.0:
1491; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1492; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
1493; AVX-SLOW-NEXT:    vzeroupper
1494; AVX-SLOW-NEXT:    retq
1495;
1496; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1497; AVX-FAST:       # %bb.0:
1498; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
1499; AVX-FAST-NEXT:    vzeroupper
1500; AVX-FAST-NEXT:    retq
1501  %x0 = extractelement <8 x double> %x, i32 0
1502  %x1 = extractelement <8 x double> %x, i32 1
1503  %x01 = fsub double %x0, %x1
1504  ret double %x01
1505}
1506
1507define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
1508; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
1509; SSE3:       # %bb.0:
1510; SSE3-NEXT:    movapd %xmm0, %xmm1
1511; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1512; SSE3-NEXT:    subsd %xmm0, %xmm1
1513; SSE3-NEXT:    movapd %xmm1, %xmm0
1514; SSE3-NEXT:    retq
1515;
1516; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
1517; AVX:       # %bb.0:
1518; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1519; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1520; AVX-NEXT:    vzeroupper
1521; AVX-NEXT:    retq
1522  %x0 = extractelement <8 x double> %x, i32 0
1523  %x1 = extractelement <8 x double> %x, i32 1
1524  %x01 = fsub double %x1, %x0
1525  ret double %x01
1526}
1527
1528; Check output when one or both extracts have extra uses.
1529
1530define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, ptr %p) {
1531; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1532; SSE3-SLOW:       # %bb.0:
1533; SSE3-SLOW-NEXT:    movss %xmm0, (%rdi)
1534; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1535; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1536; SSE3-SLOW-NEXT:    retq
1537;
1538; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1539; SSE3-FAST:       # %bb.0:
1540; SSE3-FAST-NEXT:    movss %xmm0, (%rdi)
1541; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1542; SSE3-FAST-NEXT:    retq
1543;
1544; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1545; AVX-SLOW:       # %bb.0:
1546; AVX-SLOW-NEXT:    vmovss %xmm0, (%rdi)
1547; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1548; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1549; AVX-SLOW-NEXT:    retq
1550;
1551; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1552; AVX-FAST:       # %bb.0:
1553; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
1554; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1555; AVX-FAST-NEXT:    retq
1556  %x0 = extractelement <4 x float> %x, i32 0
1557  store float %x0, ptr %p
1558  %x1 = extractelement <4 x float> %x, i32 1
1559  %x01 = fadd float %x0, %x1
1560  ret float %x01
1561}
1562
1563define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, ptr %p) {
1564; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1565; SSE3-SLOW:       # %bb.0:
1566; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1567; SSE3-SLOW-NEXT:    movss %xmm1, (%rdi)
1568; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1569; SSE3-SLOW-NEXT:    retq
1570;
1571; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1572; SSE3-FAST:       # %bb.0:
1573; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1574; SSE3-FAST-NEXT:    movss %xmm1, (%rdi)
1575; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1576; SSE3-FAST-NEXT:    retq
1577;
1578; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1579; AVX-SLOW:       # %bb.0:
1580; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1581; AVX-SLOW-NEXT:    vmovss %xmm1, (%rdi)
1582; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1583; AVX-SLOW-NEXT:    retq
1584;
1585; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1586; AVX-FAST:       # %bb.0:
1587; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
1588; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1589; AVX-FAST-NEXT:    retq
1590  %x0 = extractelement <4 x float> %x, i32 0
1591  %x1 = extractelement <4 x float> %x, i32 1
1592  store float %x1, ptr %p
1593  %x01 = fadd float %x0, %x1
1594  ret float %x01
1595}
1596
1597define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, ptr %p1, ptr %p2) {
1598; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
1599; SSE3:       # %bb.0:
1600; SSE3-NEXT:    movss %xmm0, (%rdi)
1601; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1602; SSE3-NEXT:    movss %xmm1, (%rsi)
1603; SSE3-NEXT:    addss %xmm1, %xmm0
1604; SSE3-NEXT:    retq
1605;
1606; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
1607; AVX:       # %bb.0:
1608; AVX-NEXT:    vmovss %xmm0, (%rdi)
1609; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1610; AVX-NEXT:    vmovss %xmm1, (%rsi)
1611; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1612; AVX-NEXT:    retq
1613  %x0 = extractelement <4 x float> %x, i32 0
1614  store float %x0, ptr %p1
1615  %x1 = extractelement <4 x float> %x, i32 1
1616  store float %x1, ptr %p2
1617  %x01 = fadd float %x0, %x1
1618  ret float %x01
1619}
1620
1621; Repeat tests from general reductions to verify output for hoppy targets (i.e., targets with fast horizontal ops):
1622; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
1623
1624declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
1625declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
1626
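; The reduction intrinsic calls below carry 'fast' FMF on the call site, so the
; backend is free to expand them as horizontal-add chains on fast-hops targets.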
1627define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
1628; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
1629; SSE3-SLOW:       # %bb.0:
1630; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
1631; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
1632; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1633; SSE3-SLOW-NEXT:    addps %xmm1, %xmm2
1634; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1635; SSE3-SLOW-NEXT:    addss %xmm2, %xmm1
1636; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1637; SSE3-SLOW-NEXT:    retq
1638;
1639; SSE3-FAST-LABEL: fadd_reduce_v8f32:
1640; SSE3-FAST:       # %bb.0:
1641; SSE3-FAST-NEXT:    haddps %xmm1, %xmm2
1642; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
1643; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
1644; SSE3-FAST-NEXT:    addss %xmm2, %xmm0
1645; SSE3-FAST-NEXT:    retq
1646;
1647; AVX-SLOW-LABEL: fadd_reduce_v8f32:
1648; AVX-SLOW:       # %bb.0:
1649; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1650; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
1651; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1652; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
1653; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1654; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
1655; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1656; AVX-SLOW-NEXT:    vzeroupper
1657; AVX-SLOW-NEXT:    retq
1658;
1659; AVX-FAST-LABEL: fadd_reduce_v8f32:
1660; AVX-FAST:       # %bb.0:
1661; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
1662; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
1663; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
1664; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
1665; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1666; AVX-FAST-NEXT:    vzeroupper
1667; AVX-FAST-NEXT:    retq
1668  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
1669  ret float %r
1670}
1671
1672define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
1673; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
1674; SSE3-SLOW:       # %bb.0:
1675; SSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
1676; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm2
1677; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1678; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm2
1679; SSE3-SLOW-NEXT:    addsd %xmm2, %xmm0
1680; SSE3-SLOW-NEXT:    retq
1681;
1682; SSE3-FAST-LABEL: fadd_reduce_v4f64:
1683; SSE3-FAST:       # %bb.0:
1684; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm2
1685; SSE3-FAST-NEXT:    haddpd %xmm2, %xmm2
1686; SSE3-FAST-NEXT:    addsd %xmm2, %xmm0
1687; SSE3-FAST-NEXT:    retq
1688;
1689; AVX-SLOW-LABEL: fadd_reduce_v4f64:
1690; AVX-SLOW:       # %bb.0:
1691; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1692; AVX-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
1693; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1694; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
1695; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1696; AVX-SLOW-NEXT:    vzeroupper
1697; AVX-SLOW-NEXT:    retq
1698;
1699; AVX-FAST-LABEL: fadd_reduce_v4f64:
1700; AVX-FAST:       # %bb.0:
1701; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
1702; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
1703; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
1704; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1705; AVX-FAST-NEXT:    vzeroupper
1706; AVX-FAST-NEXT:    retq
1707  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
1708  ret double %r
1709}
1710
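; PR39936: https://bugs.llvm.org/show_bug.cgi?id=39936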
1711define float @PR39936_v8f32(<8 x float>) {
1730; SSE3-SLOW-LABEL: PR39936_v8f32:
1731; SSE3-SLOW:       # %bb.0:
1732; SSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
1733; SSE3-SLOW-NEXT:    haddps %xmm0, %xmm0
1734; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1735; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1736; SSE3-SLOW-NEXT:    retq
1737;
1738; SSE3-FAST-LABEL: PR39936_v8f32:
1739; SSE3-FAST:       # %bb.0:
1740; SSE3-FAST-NEXT:    haddps %xmm1, %xmm0
1741; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1742; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1743; SSE3-FAST-NEXT:    retq
1744;
1745; AVX-SLOW-LABEL: PR39936_v8f32:
1746; AVX-SLOW:       # %bb.0:
1747; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
1748; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1749; AVX-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1750; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1751; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1752; AVX-SLOW-NEXT:    vzeroupper
1753; AVX-SLOW-NEXT:    retq
1754;
1755; AVX-FAST-LABEL: PR39936_v8f32:
1756; AVX-FAST:       # %bb.0:
1757; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
1758; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1759; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1760; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1761; AVX-FAST-NEXT:    vzeroupper
1762; AVX-FAST-NEXT:    retq
1763  %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
1764  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1765  %4 = fadd <8 x float> %2, %3
1766  %5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1767  %6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1768  %7 = fadd <8 x float> %5, %6
1769  %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1770  %9 = fadd <8 x float> %7, %8
1771  %10 = extractelement <8 x float> %9, i32 0
1772  ret float %10
1773}
1774
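; Partial reductions - only the low 128 bits feed the extracted element, so
; hadd32_8 and hadd32_16 should lower the same as hadd32_4 (modulo vzeroupper).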
1775define float @hadd32_4(<4 x float> %x225) {
1776; SSE3-SLOW-LABEL: hadd32_4:
1777; SSE3-SLOW:       # %bb.0:
1778; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
1779; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1780; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
1781; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1782; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1783; SSE3-SLOW-NEXT:    retq
1784;
1785; SSE3-FAST-LABEL: hadd32_4:
1786; SSE3-FAST:       # %bb.0:
1787; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
1788; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1789; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
1790; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1791; SSE3-FAST-NEXT:    retq
1792;
1793; AVX-SLOW-LABEL: hadd32_4:
1794; AVX-SLOW:       # %bb.0:
1795; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1796; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1797; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1798; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1799; AVX-SLOW-NEXT:    retq
1800;
1801; AVX-FAST-LABEL: hadd32_4:
1802; AVX-FAST:       # %bb.0:
1803; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1804; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1805; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1806; AVX-FAST-NEXT:    retq
1807  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
1808  %x227 = fadd <4 x float> %x225, %x226
1809  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1810  %x229 = fadd <4 x float> %x227, %x228
1811  %x230 = extractelement <4 x float> %x229, i32 0
1812  ret float %x230
1813}
1814
1815define float @hadd32_8(<8 x float> %x225) {
1816; SSE3-SLOW-LABEL: hadd32_8:
1817; SSE3-SLOW:       # %bb.0:
1818; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
1819; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1820; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
1821; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1822; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1823; SSE3-SLOW-NEXT:    retq
1824;
1825; SSE3-FAST-LABEL: hadd32_8:
1826; SSE3-FAST:       # %bb.0:
1827; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
1828; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1829; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
1830; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1831; SSE3-FAST-NEXT:    retq
1832;
1833; AVX-SLOW-LABEL: hadd32_8:
1834; AVX-SLOW:       # %bb.0:
1835; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1836; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1837; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1838; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1839; AVX-SLOW-NEXT:    vzeroupper
1840; AVX-SLOW-NEXT:    retq
1841;
1842; AVX-FAST-LABEL: hadd32_8:
1843; AVX-FAST:       # %bb.0:
1844; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1845; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1846; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1847; AVX-FAST-NEXT:    vzeroupper
1848; AVX-FAST-NEXT:    retq
1849  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1850  %x227 = fadd <8 x float> %x225, %x226
1851  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1852  %x229 = fadd <8 x float> %x227, %x228
1853  %x230 = extractelement <8 x float> %x229, i32 0
1854  ret float %x230
1855}
1856
1857define float @hadd32_16(<16 x float> %x225) {
1858; SSE3-SLOW-LABEL: hadd32_16:
1859; SSE3-SLOW:       # %bb.0:
1860; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
1861; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1862; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
1863; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1864; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1865; SSE3-SLOW-NEXT:    retq
1866;
1867; SSE3-FAST-LABEL: hadd32_16:
1868; SSE3-FAST:       # %bb.0:
1869; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
1870; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1871; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
1872; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1873; SSE3-FAST-NEXT:    retq
1874;
1875; AVX-SLOW-LABEL: hadd32_16:
1876; AVX-SLOW:       # %bb.0:
1877; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1878; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1879; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1880; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1881; AVX-SLOW-NEXT:    vzeroupper
1882; AVX-SLOW-NEXT:    retq
1883;
1884; AVX-FAST-LABEL: hadd32_16:
1885; AVX-FAST:       # %bb.0:
1886; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1887; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1888; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1889; AVX-FAST-NEXT:    vzeroupper
1890; AVX-FAST-NEXT:    retq
1891  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1892  %x227 = fadd <16 x float> %x225, %x226
1893  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1894  %x229 = fadd <16 x float> %x227, %x228
1895  %x230 = extractelement <16 x float> %x229, i32 0
1896  ret float %x230
1897}
1898
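; With optsize, the smaller haddps encoding is preferred for the final step
; even on targets without fast-hops.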
1899define float @hadd32_4_optsize(<4 x float> %x225) optsize {
1900; SSE3-LABEL: hadd32_4_optsize:
1901; SSE3:       # %bb.0:
1902; SSE3-NEXT:    movaps %xmm0, %xmm1
1903; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1904; SSE3-NEXT:    addps %xmm1, %xmm0
1905; SSE3-NEXT:    haddps %xmm0, %xmm0
1906; SSE3-NEXT:    retq
1907;
1908; AVX-LABEL: hadd32_4_optsize:
1909; AVX:       # %bb.0:
1910; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1911; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1912; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1913; AVX-NEXT:    retq
1914  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
1915  %x227 = fadd <4 x float> %x225, %x226
1916  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1917  %x229 = fadd <4 x float> %x227, %x228
1918  %x230 = extractelement <4 x float> %x229, i32 0
1919  ret float %x230
1920}
1921
1922define float @hadd32_8_optsize(<8 x float> %x225) optsize {
1923; SSE3-LABEL: hadd32_8_optsize:
1924; SSE3:       # %bb.0:
1925; SSE3-NEXT:    movaps %xmm0, %xmm1
1926; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1927; SSE3-NEXT:    addps %xmm1, %xmm0
1928; SSE3-NEXT:    haddps %xmm0, %xmm0
1929; SSE3-NEXT:    retq
1930;
1931; AVX-LABEL: hadd32_8_optsize:
1932; AVX:       # %bb.0:
1933; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1934; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1935; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1936; AVX-NEXT:    vzeroupper
1937; AVX-NEXT:    retq
1938  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1939  %x227 = fadd <8 x float> %x225, %x226
1940  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1941  %x229 = fadd <8 x float> %x227, %x228
1942  %x230 = extractelement <8 x float> %x229, i32 0
1943  ret float %x230
1944}
1945
1946define float @hadd32_16_optsize(<16 x float> %x225) optsize {
1947; SSE3-LABEL: hadd32_16_optsize:
1948; SSE3:       # %bb.0:
1949; SSE3-NEXT:    movaps %xmm0, %xmm1
1950; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1951; SSE3-NEXT:    addps %xmm1, %xmm0
1952; SSE3-NEXT:    haddps %xmm0, %xmm0
1953; SSE3-NEXT:    retq
1954;
1955; AVX-LABEL: hadd32_16_optsize:
1956; AVX:       # %bb.0:
1957; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1958; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1959; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1960; AVX-NEXT:    vzeroupper
1961; AVX-NEXT:    retq
1962  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1963  %x227 = fadd <16 x float> %x225, %x226
1964  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1965  %x229 = fadd <16 x float> %x227, %x228
1966  %x230 = extractelement <16 x float> %x229, i32 0
1967  ret float %x230
1968}
1969
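; PGSO (profile-guided size optimization) applies the same size heuristic when
; the !prof metadata marks a function as cold (function_entry_count 0 via !14).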
1970define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
1971; SSE3-LABEL: hadd32_4_pgso:
1972; SSE3:       # %bb.0:
1973; SSE3-NEXT:    movaps %xmm0, %xmm1
1974; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1975; SSE3-NEXT:    addps %xmm1, %xmm0
1976; SSE3-NEXT:    haddps %xmm0, %xmm0
1977; SSE3-NEXT:    retq
1978;
1979; AVX-LABEL: hadd32_4_pgso:
1980; AVX:       # %bb.0:
1981; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1982; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1983; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1984; AVX-NEXT:    retq
1985  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
1986  %x227 = fadd <4 x float> %x225, %x226
1987  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1988  %x229 = fadd <4 x float> %x227, %x228
1989  %x230 = extractelement <4 x float> %x229, i32 0
1990  ret float %x230
1991}
1992
1993define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
1994; SSE3-LABEL: hadd32_8_pgso:
1995; SSE3:       # %bb.0:
1996; SSE3-NEXT:    movaps %xmm0, %xmm1
1997; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1998; SSE3-NEXT:    addps %xmm1, %xmm0
1999; SSE3-NEXT:    haddps %xmm0, %xmm0
2000; SSE3-NEXT:    retq
2001;
2002; AVX-LABEL: hadd32_8_pgso:
2003; AVX:       # %bb.0:
2004; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
2005; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2006; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2007; AVX-NEXT:    vzeroupper
2008; AVX-NEXT:    retq
2009  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2010  %x227 = fadd <8 x float> %x225, %x226
2011  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2012  %x229 = fadd <8 x float> %x227, %x228
2013  %x230 = extractelement <8 x float> %x229, i32 0
2014  ret float %x230
2015}
2016
2017define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
2018; SSE3-LABEL: hadd32_16_pgso:
2019; SSE3:       # %bb.0:
2020; SSE3-NEXT:    movaps %xmm0, %xmm1
2021; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2022; SSE3-NEXT:    addps %xmm1, %xmm0
2023; SSE3-NEXT:    haddps %xmm0, %xmm0
2024; SSE3-NEXT:    retq
2025;
2026; AVX-LABEL: hadd32_16_pgso:
2027; AVX:       # %bb.0:
2028; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
2029; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2030; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2031; AVX-NEXT:    vzeroupper
2032; AVX-NEXT:    retq
2033  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2034  %x227 = fadd <16 x float> %x225, %x226
2035  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2036  %x229 = fadd <16 x float> %x227, %x228
2037  %x230 = extractelement <16 x float> %x229, i32 0
2038  ret float %x230
2039}
2040
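; Only the final fadd needs reassoc+nsz for the horizontal-op transform; the
; earlier fadd can have no FMF (see the negative test below).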
2041define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
2042; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
2043; SSE3-SLOW:       # %bb.0:
2044; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
2045; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2046; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
2047; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2048; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
2049; SSE3-SLOW-NEXT:    retq
2050;
2051; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
2052; SSE3-FAST:       # %bb.0:
2053; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
2054; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2055; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
2056; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
2057; SSE3-FAST-NEXT:    retq
2058;
2059; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
2060; AVX-SLOW:       # %bb.0:
2061; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
2062; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2063; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2064; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
2065; AVX-SLOW-NEXT:    vzeroupper
2066; AVX-SLOW-NEXT:    retq
2067;
2068; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
2069; AVX-FAST:       # %bb.0:
2070; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2071; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2072; AVX-FAST-NEXT:    vzeroupper
2073; AVX-FAST-NEXT:    retq
2074  %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2075  %x0213 = fadd <8 x float> %x, %x23
2076  %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2077  %x0123 = fadd nsz reassoc <8 x float> %x0213, %x13
2078  %r = extractelement <8 x float> %x0123, i32 0
2079  ret float %r
2080}
2081
2082; Negative test - only the flags on the final math op in the
2083; sequence determine whether we can transform to horizontal ops.
2084
2085define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
2086; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2087; SSE3-SLOW:       # %bb.0:
2088; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
2089; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2090; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
2091; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2092; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
2093; SSE3-SLOW-NEXT:    retq
2094;
2095; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2096; SSE3-FAST:       # %bb.0:
2097; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
2098; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2099; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
2100; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
2101; SSE3-FAST-NEXT:    retq
2102;
2103; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2104; AVX-SLOW:       # %bb.0:
2105; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
2106; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2107; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2108; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
2109; AVX-SLOW-NEXT:    vzeroupper
2110; AVX-SLOW-NEXT:    retq
2111;
2112; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2113; AVX-FAST:       # %bb.0:
2114; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
2115; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2116; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2117; AVX-FAST-NEXT:    vzeroupper
2118; AVX-FAST-NEXT:    retq
2119  %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2120  %x0213 = fadd fast <8 x float> %x, %x23
2121  %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2122  %x0123 = fadd ninf nnan <8 x float> %x0213, %x13
2123  %r = extractelement <8 x float> %x0123, i32 0
2124  ret float %r
2125}
2126
2127define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
2128; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
2129; SSE3-SLOW:       # %bb.0:
2130; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
2131; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2132; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
2133; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2134; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
2135; SSE3-SLOW-NEXT:    retq
2136;
2137; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
2138; SSE3-FAST:       # %bb.0:
2139; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
2140; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2141; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
2142; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
2143; SSE3-FAST-NEXT:    retq
2144;
2145; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
2146; AVX-SLOW:       # %bb.0:
2147; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
2148; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2149; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2150; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
2151; AVX-SLOW-NEXT:    vzeroupper
2152; AVX-SLOW-NEXT:    retq
2153;
2154; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
2155; AVX-FAST:       # %bb.0:
2156; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2157; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
2158; AVX-FAST-NEXT:    vzeroupper
2159; AVX-FAST-NEXT:    retq
2160  %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2161  %x0213 = fadd <16 x float> %x, %x23
2162  %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2163  %x0123 = fadd reassoc nsz <16 x float> %x0213, %x13
2164  %r = extractelement <16 x float> %x0123, i32 0
2165  ret float %r
2166}
2167
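; The ProfileSummary module flags below make the function_entry_count profile
; data effective for the _pgso tests above.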
2168!llvm.module.flags = !{!0}
2169!0 = !{i32 1, !"ProfileSummary", !1}
2170!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
2171!2 = !{!"ProfileFormat", !"InstrProf"}
2172!3 = !{!"TotalCount", i64 10000}
2173!4 = !{!"MaxCount", i64 10}
2174!5 = !{!"MaxInternalCount", i64 1}
2175!6 = !{!"MaxFunctionCount", i64 1000}
2176!7 = !{!"NumCounts", i64 3}
2177!8 = !{!"NumFunctions", i64 3}
2178!9 = !{!"DetailedSummary", !10}
2179!10 = !{!11, !12, !13}
2180!11 = !{i32 10000, i64 100, i32 1}
2181!12 = !{i32 999000, i64 100, i32 1}
2182!13 = !{i32 999999, i64 1, i32 2}
2183!14 = !{!"function_entry_count", i64 0}
2184