; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
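; Each RUN line compiles the same IR for a different SSE/AVX/AVX-512 feature
; set and checks the output against the shared (SSE/AVX) and per-target
; prefixes; both AVX512 runs share the AVX512 prefix.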

;
; vXf32 (accum)
;
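; The accumulator %a0 is live and the calls carry no reassociation fast-math
; flags, so the reduction must stay strictly in order: one scalar addss per
; element, with a shuffle extracting each successive lane.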

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm4, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;
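; -0.0 is the identity for fadd, so the start value folds away and the ordered
; reduction begins directly with element 0; the +fast-hops AVX1 run can use
; haddps for the first in-register pair.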

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT:    addss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT:    addss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT:    addss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT:    addss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;
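; With an undef accumulator the start value is materialized from the constant
; pool (the {{\.?LCPI...}} operand of the first addss); the remaining lanes
; are still added in order.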

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT:    addss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT:    addss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT:    addss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    addss %xmm5, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT:    addss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    addss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT:    addss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    addss %xmm5, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT:    addss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    addss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;
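; Same ordered pattern for doubles: one addsd per element, with unpckhpd or
; vshufpd pulling out the high lane of each 128-bit chunk and
; vextractf128/vextractf32x4 splitting the wider vectors.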

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

1211define double @test_v16f64(double %a0, <16 x double> %a1) {
1212; SSE2-LABEL: test_v16f64:
1213; SSE2:       # %bb.0:
1214; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
1215; SSE2-NEXT:    addsd %xmm1, %xmm0
1216; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1217; SSE2-NEXT:    addsd %xmm1, %xmm0
1218; SSE2-NEXT:    addsd %xmm2, %xmm0
1219; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1220; SSE2-NEXT:    addsd %xmm2, %xmm0
1221; SSE2-NEXT:    addsd %xmm3, %xmm0
1222; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1223; SSE2-NEXT:    addsd %xmm3, %xmm0
1224; SSE2-NEXT:    addsd %xmm4, %xmm0
1225; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1226; SSE2-NEXT:    addsd %xmm4, %xmm0
1227; SSE2-NEXT:    addsd %xmm5, %xmm0
1228; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
1229; SSE2-NEXT:    addsd %xmm5, %xmm0
1230; SSE2-NEXT:    addsd %xmm6, %xmm0
1231; SSE2-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
1232; SSE2-NEXT:    addsd %xmm6, %xmm0
1233; SSE2-NEXT:    addsd %xmm7, %xmm0
1234; SSE2-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
1235; SSE2-NEXT:    addsd %xmm7, %xmm0
1236; SSE2-NEXT:    addsd %xmm8, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE2-NEXT:    addsd %xmm8, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addsd %xmm1, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addsd %xmm1, %xmm0
; SSE41-NEXT:    addsd %xmm2, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    addsd %xmm2, %xmm0
; SSE41-NEXT:    addsd %xmm3, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE41-NEXT:    addsd %xmm3, %xmm0
; SSE41-NEXT:    addsd %xmm4, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE41-NEXT:    addsd %xmm4, %xmm0
; SSE41-NEXT:    addsd %xmm5, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE41-NEXT:    addsd %xmm5, %xmm0
; SSE41-NEXT:    addsd %xmm6, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE41-NEXT:    addsd %xmm6, %xmm0
; SSE41-NEXT:    addsd %xmm7, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE41-NEXT:    addsd %xmm7, %xmm0
; SSE41-NEXT:    addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}
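; NOTE: These calls carry no 'reassoc'/'fast' flags, so the reduction must be
; evaluated strictly in element order: codegen emits one scalar add per
; element, using unpckhpd/shufpd to pull the high lane out of each 128-bit
; pair and vextractf128 / vextractf32x4 to peel off the upper subvectors.
; For <16 x double> with an extra accumulator argument, the last two vector
; elements do not fit in %xmm1-%xmm7 under the SSE calling convention and are
; added straight from their stack slots.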

;
; vXf64 (zero)
;

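; The tests below pass -0.0 as the start value. Since -0.0 is the identity for
; fadd, no accumulator add is emitted and the chain begins directly with
; element 0; with +fast-hops (AVX1-FAST) the first two elements are combined
; with vhaddpd instead of a shuffle plus vaddsd.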
define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double -0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT:    addsd %xmm8, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double -0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

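; The tests below pass an undef start value. The first partial sum involving
; the undef accumulator is folded to a constant, so the lowering begins by
; adding a value loaded from the constant pool ({{\.?LCPI...}}) and only the
; remaining vector elements are accumulated explicitly.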
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

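; PR64627: the <5 x i1> mask is the all-false bitcast of i5 0, so the select
; yields the splat of 1.0 and the strict reduction constant-folds to
; -0.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 = 5.0, which every target simply
; materializes as an immediate load.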
define float @PR64627() {
; SSE-LABEL: PR64627:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR64627:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR64627:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT:    retq
  %1 = bitcast i5 0 to <5 x i1>
  %2 = select <5 x i1> %1, <5 x float> zeroinitializer, <5 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  %3 = call float @llvm.vector.reduce.fadd.v5f32(float -0.0, <5 x float> %2)
  ret float %3
}
declare float @llvm.vector.reduce.fadd.v5f32(float, <5 x float>)

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
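; For reference (not exercised in this file): adding the 'reassoc' fast-math
; flag to the call, e.g.
;   %r = call reassoc double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %v)
; relaxes the evaluation order and lets the backend use pairwise/tree adds
; instead of the strict scalar chains checked above.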